Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /mm
download: linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2
37 files changed, 28187 insertions, 0 deletions
diff --git a/mm/Makefile b/mm/Makefile
new file mode 100644
index 000000000000..097408064f6a
--- /dev/null
+++ b/mm/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for the Linux memory manager.
+#
+
+mmu-y			:= nommu.o
+mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
+			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
+			   vmalloc.o
+
+obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+			   page_alloc.o page-writeback.o pdflush.o \
+			   readahead.o slab.o swap.o truncate.o vmscan.o \
+			   prio_tree.o $(mmu-y)
+
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
+obj-$(CONFIG_NUMA) 	+= mempolicy.o
+obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+
diff --git a/mm/bootmem.c b/mm/bootmem.c
new file mode 100644
index 000000000000..260e703850d8
--- /dev/null
+++ b/mm/bootmem.c
@@ -0,0 +1,400 @@
+/*
+ *  linux/mm/bootmem.c
+ *
+ *  Copyright (C) 1999 Ingo Molnar
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *
+ *  simple boot-time physical memory area allocator and
+ *  free memory collector. It's used to deal with reserved
+ *  system memory and memory holes as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+#include <asm/io.h>
+#include "internal.h"
+
+/*
+ * Access to this subsystem has to be serialized externally. (this is
+ * true for the boot process anyway)
+ */
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+EXPORT_SYMBOL(max_pfn);		/* This is exported so
+				 * dma_get_required_mask(), which uses
+				 * it, can be an inline function */
+
+/* Number of whole pages needed to hold the boot bitmap covering 'pages' pages */
+unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+{
+	/* one bit per page, rounded up to whole bytes */
+	unsigned long bytes = (pages + 7) / 8;
+
+	/* round the byte count up to a page multiple, then convert to pages */
+	bytes = (bytes + ~PAGE_MASK) & PAGE_MASK;
+
+	return bytes >> PAGE_SHIFT;
+}
+
+/*
+ * Set up the boot allocator for one node; returns the bitmap size in bytes.
+ */
+static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
+	unsigned long mapstart, unsigned long start, unsigned long end)
+{
+	unsigned long nbytes = ((end - start) + 7) / 8;	/* one bit per page */
+	bootmem_data_t *bd = pgdat->bdata;
+
+	/* round the bitmap size up to a multiple of sizeof(long) */
+	nbytes = (nbytes + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+
+	bd->node_low_pfn = end;
+	bd->node_boot_start = (start << PAGE_SHIFT);
+	bd->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
+
+	/* link this node in at the head of pgdat_list */
+	pgdat->pgdat_next = pgdat_list;
+	pgdat_list = pgdat;
+
+	/* all pages start out reserved; setup_arch() registers free RAM */
+	memset(bd->node_bootmem_map, 0xff, nbytes);
+
+	return nbytes;
+}
+
+/*
+ * Mark the physical range [addr, addr + size) as unallocatable.  RAM that
+ * stays usable is either consumed by boot-time allocations or handed
+ * to the free page pool later on.
+ */
+static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+{
+	unsigned long bit;
+	/*
+	 * Rounded outwards: partially covered pages are reserved whole.
+	 */
+	unsigned long first = (addr - bdata->node_boot_start)/PAGE_SIZE;
+	unsigned long limit = (addr + size - bdata->node_boot_start +
+							PAGE_SIZE-1)/PAGE_SIZE;
+	unsigned long end_pfn = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
+
+	BUG_ON(!size);
+	BUG_ON(first >= limit);
+	BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
+	BUG_ON(end_pfn > bdata->node_low_pfn);
+
+	for (bit = first; bit < limit; bit++) {
+		if (!test_and_set_bit(bit, bdata->node_bootmem_map))
+			continue;
+#ifdef CONFIG_DEBUG_BOOTMEM
+		printk("hm, page %08lx reserved twice.\n", bit*PAGE_SIZE);
+#endif
+	}
+}
+
+static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+{
+	unsigned long bit;
+	unsigned long first_pfn;
+	/*
+	 * The end of the range is rounded down: a page that is only
+	 * partially freed stays reserved.
+	 */
+	unsigned long first;
+	unsigned long limit = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
+	unsigned long end_pfn = (addr + size)/PAGE_SIZE;
+
+	BUG_ON(!size);
+	BUG_ON(end_pfn > bdata->node_low_pfn);
+
+	/* pull the last_success hint back if the freed range lies below it */
+	if (bdata->last_success > addr)
+		bdata->last_success = addr;
+
+	/* the start of the range is rounded up to the next page boundary */
+	first_pfn = (addr + PAGE_SIZE-1) / PAGE_SIZE;
+	first = first_pfn - (bdata->node_boot_start/PAGE_SIZE);
+
+	for (bit = first; bit < limit; bit++) {
+		/* freeing a page whose bit is not set is a bug */
+		if (unlikely(!test_and_clear_bit(bit, bdata->node_bootmem_map)))
+			BUG();
+	}
+}
+
+/*
+ * Allocate 'size' zeroed bytes from one node, preferably at or above
+ * 'goal'; returns NULL if nothing fits.  'align' must be a power of 2.
+ *
+ * We 'merge' subsequent allocations to save space. We might 'lose'
+ * some fraction of a page if allocations cannot be satisfied due to
+ * size constraints on boxes where there is physical RAM space
+ * fragmentation - in these cases (mostly large memory boxes) this
+ * is not a problem.  On low memory boxes we get it right in 100% of
+ * the cases.
+ *
+ * NOTE:  This function is _not_ reentrant.
+ */
+static void * __init
+__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
+		unsigned long align, unsigned long goal)
+{
+	unsigned long offset, remaining_size, areasize, preferred;
+	unsigned long i, start = 0, incr, eidx;
+	void *ret;
+
+	if(!size) {
+		printk("__alloc_bootmem_core(): zero-sized request\n");
+		BUG();
+	}
+	BUG_ON(align & (align-1));
+
+	eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	offset = 0;
+	if (align &&
+	    (bdata->node_boot_start & (align - 1UL)) != 0)
+		offset = (align - (bdata->node_boot_start & (align - 1UL)));
+	offset >>= PAGE_SHIFT;
+
+	/*
+	 * We try to allocate bootmem pages above 'goal'
+	 * first, then we try to allocate lower pages.
+	 */
+	if (goal && (goal >= bdata->node_boot_start) &&
+	    ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
+		preferred = goal - bdata->node_boot_start;
+
+		if (bdata->last_success >= preferred)
+			preferred = bdata->last_success;
+	} else
+		preferred = 0;
+
+	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+	preferred += offset;
+	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
+	incr = align >> PAGE_SHIFT ? : 1;
+
+restart_scan:
+	for (i = preferred; i < eidx; i += incr) {
+		unsigned long j;
+		i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
+		i = ALIGN(i, incr);
+		if (i >= eidx || test_bit(i, bdata->node_bootmem_map))
+			continue;	/* i >= eidx: loop test ends scan */
+		for (j = i + 1; j < i + areasize; ++j) {
+			if (j >= eidx)
+				goto fail_block;
+			if (test_bit (j, bdata->node_bootmem_map))
+				goto fail_block;
+		}
+		start = i;
+		goto found;
+	fail_block:
+		i = ALIGN(j, incr);
+	}
+
+	if (preferred > offset) {
+		preferred = offset;
+		goto restart_scan;
+	}
+	return NULL;
+
+found:
+	bdata->last_success = start << PAGE_SHIFT;
+	BUG_ON(start >= eidx);
+
+	/*
+	 * Is the next page of the previous allocation-end the start
+	 * of this allocation's buffer? If yes then we can 'merge'
+	 * the previous partial page with this allocation.
+	 */
+	if (align < PAGE_SIZE &&
+	    bdata->last_offset && bdata->last_pos+1 == start) {
+		offset = (bdata->last_offset+align-1) & ~(align-1);
+		BUG_ON(offset > PAGE_SIZE);
+		remaining_size = PAGE_SIZE-offset;
+		if (size < remaining_size) {
+			areasize = 0;
+			/* last_pos unchanged */
+			bdata->last_offset = offset+size;
+			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
+						bdata->node_boot_start);
+		} else {
+			remaining_size = size - remaining_size;
+			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
+			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
+						bdata->node_boot_start);
+			bdata->last_pos = start+areasize-1;
+			bdata->last_offset = remaining_size;
+		}
+		bdata->last_offset &= ~PAGE_MASK;
+	} else {
+		bdata->last_pos = start + areasize - 1;
+		bdata->last_offset = size & ~PAGE_MASK;
+		ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
+	}
+
+	/*
+	 * Reserve the area now:
+	 */
+	for (i = start; i < start+areasize; i++)
+		if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
+			BUG();
+	memset(ret, 0, size);
+	return ret;
+}
+
+/* Free all unreserved pages of a node, then its bitmap; returns the count. */
+static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
+{
+	struct page *page;
+	bootmem_data_t *bdata = pgdat->bdata;
+	unsigned long i, count = 0, total = 0;
+	unsigned long idx;
+	unsigned long *map;
+	int gofast = 0;
+
+	BUG_ON(!bdata->node_bootmem_map);
+
+	/* first extant page of the node */
+	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	map = bdata->node_bootmem_map;
+	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
+	if (bdata->node_boot_start == 0 ||
+	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
+		gofast = 1;
+	for (i = 0; i < idx; ) {
+		unsigned long v = ~map[i / BITS_PER_LONG];
+		if (gofast && v == ~0UL) {
+			int j, order;
+
+			count += BITS_PER_LONG;
+			__ClearPageReserved(page);
+			order = ffs(BITS_PER_LONG) - 1;
+			set_page_refs(page, order);
+			for (j = 1; j < BITS_PER_LONG; j++) {
+				if (j + 16 < BITS_PER_LONG)
+					prefetchw(page + j + 16);
+				__ClearPageReserved(page + j);
+			}
+			__free_pages(page, order);
+			i += BITS_PER_LONG;
+			page += BITS_PER_LONG;
+		} else if (v) {
+			unsigned long m;
+			for (m = 1; m && i < idx; m<<=1, page++, i++) {
+				if (v & m) {
+					count++;
+					__ClearPageReserved(page);
+					set_page_refs(page, 0);
+					__free_page(page);
+				}
+			}
+		} else {
+			i+=BITS_PER_LONG;
+			page += BITS_PER_LONG;
+		}
+	}
+	total += count;
+
+	/*
+	 * Now free the allocator bitmap itself, it's not
+	 * needed anymore:
+	 */
+	page = virt_to_page(bdata->node_bootmem_map);
+	/* (pages + 7) / 8 bytes: a partly used last byte still counts */
+	count = bootmem_bootmap_pages(idx);
+	for (i = 0; i < count; i++, page++) {
+		__ClearPageReserved(page);
+		set_page_count(page, 1);
+		__free_page(page);
+	}
+	total += count;
+	bdata->node_bootmem_map = NULL;
+
+	return total;
+}
+
+unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+{
+	return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); /* per node */
+}
+
+void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+{
+	reserve_bootmem_core(pgdat->bdata, physaddr, size);	/* this node's bitmap */
+}
+
+void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+{
+	free_bootmem_core(pgdat->bdata, physaddr, size);	/* this node's bitmap */
+}
+
+unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
+{
+	return free_all_bootmem_core(pgdat);	/* number of pages released */
+}
+
+unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
+{
+	min_low_pfn = start;	/* 'start' is also where the bitmap is placed */
+	max_low_pfn = pages;
+	return init_bootmem_core(NODE_DATA(0), start, 0, pages); /* pfns 0..pages */
+}
+
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+void __init reserve_bootmem (unsigned long addr, unsigned long size)
+{
+	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);	/* always node 0 */
+}
+#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
+
+void __init free_bootmem (unsigned long addr, unsigned long size)
+{
+	free_bootmem_core(NODE_DATA(0)->bdata, addr, size);	/* always node 0 */
+}
+
+unsigned long __init free_all_bootmem (void)
+{
+	return free_all_bootmem_core(NODE_DATA(0));	/* always node 0 */
+}
+
+void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
+{
+	pg_data_t *pgdat = pgdat_list;
+	void *ptr;
+
+	/* try each node on the pgdat list in turn; the first success wins */
+	for_each_pgdat(pgdat) {
+		ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+		if (ptr)
+			return ptr;
+	}
+
+	/* Whoops, no node could satisfy the allocation request. */
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
+}
+
+void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
+{
+	void *ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+
+	/* the requested node could not help: fall back to any node */
+	if (!ptr)
+		ptr = __alloc_bootmem(size, align, goal);
+
+	return ptr;
+}
+
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 000000000000..57264d74b8bf
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,111 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003	akpm@digeo.com
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+#include <linux/syscalls.h>
+
+#include <asm/unistd.h>
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced; neither is done here.
+ */
+asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+	struct file *file = fget(fd);
+	struct backing_dev_info *bdi;
+	struct address_space *mapping;
+	pgoff_t first_pg;
+	pgoff_t last_pg;
+	unsigned long count;
+	loff_t endbyte;
+	int err = 0;
+
+	if (!file)
+		return -EBADF;
+
+	mapping = file->f_mapping;
+	if (len < 0 || !mapping) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* len == 0 means "as much as possible"; so does offset + len overflow */
+	endbyte = offset + len;
+	if (endbyte < len || !len)
+		endbyte = -1;
+
+	bdi = mapping->backing_dev_info;
+
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+		file->f_ra.ra_pages = bdi->ra_pages;
+		break;
+	case POSIX_FADV_RANDOM:
+		file->f_ra.ra_pages = 0;
+		break;
+	case POSIX_FADV_SEQUENTIAL:
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		break;
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_NOREUSE:
+		if (!mapping->a_ops->readpage) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Include the partially covered first and last pages */
+		first_pg = offset >> PAGE_CACHE_SHIFT;
+		last_pg = (endbyte-1) >> PAGE_CACHE_SHIFT;
+
+		/* The "+1" may wrap to zero: treat that as the maximum */
+		count = last_pg - first_pg + 1;
+		if (count == 0)
+			count = ~0UL;
+
+		err = force_page_cache_readahead(mapping, file,
+				first_pg,
+				max_sane_readahead(count));
+		if (err > 0)
+			err = 0;
+		break;
+	case POSIX_FADV_DONTNEED:
+		if (!bdi_write_congested(mapping->backing_dev_info))
+			filemap_flush(mapping);
+
+		/* Only pages lying wholly inside the range are dropped */
+		first_pg = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		last_pg = (endbyte >> PAGE_CACHE_SHIFT);
+
+		if (first_pg < last_pg)
+			invalidate_mapping_pages(mapping, first_pg, last_pg-1);
+		break;
+	default:
+		err = -EINVAL;
+	}
+out:
+	fput(file);
+	return err;
+}
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+{
+	return sys_fadvise64_64(fd, offset, len, advice);	/* size_t len passed as loff_t */
+}
+
+#endif
diff --git a/mm/filemap.c b/mm/filemap.c
new file mode 100644
index 000000000000..439b2bea8e34
--- /dev/null
+++ b/mm/filemap.c
@@ -0,0 +1,2306 @@
+/*
+ *	linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/hash.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+/*
+ * This is needed for the following functions:
+ *  - try_to_release_page
+ *  - block_invalidatepage
+ *  - generic_osync_inode
+ *
+ * FIXME: remove all knowledge of the buffer layer from the core VM
+ */
+#include <linux/buffer_head.h> /* for generic_osync_inode */
+
+#include <asm/uaccess.h>
+#include <asm/mman.h>
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995  Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+ */
+
+/*
+ * Lock ordering:
+ *
+ *  ->i_mmap_lock		(vmtruncate)
+ *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
+ *      ->swap_list_lock
+ *        ->swap_device_lock	(exclusive_swap_page, others)
+ *          ->mapping->tree_lock
+ *
+ *  ->i_sem
+ *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *
+ *  ->mmap_sem
+ *    ->i_mmap_lock
+ *      ->page_table_lock	(various places, mainly in mmap.c)
+ *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
+ *
+ *  ->mmap_sem
+ *    ->lock_page		(access_process_vm)
+ *
+ *  ->mmap_sem
+ *    ->i_sem			(msync)
+ *
+ *  ->i_sem
+ *    ->i_alloc_sem             (various)
+ *
+ *  ->inode_lock
+ *    ->sb_lock			(fs/fs-writeback.c)
+ *    ->mapping->tree_lock	(__sync_single_inode)
+ *
+ *  ->i_mmap_lock
+ *    ->anon_vma.lock		(vma_adjust)
+ *
+ *  ->anon_vma.lock
+ *    ->page_table_lock		(anon_vma_prepare and various)
+ *
+ *  ->page_table_lock
+ *    ->swap_device_lock	(try_to_unmap_one)
+ *    ->private_lock		(try_to_unmap_one)
+ *    ->tree_lock		(try_to_unmap_one)
+ *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->private_lock		(page_remove_rmap->set_page_dirty)
+ *    ->tree_lock		(page_remove_rmap->set_page_dirty)
+ *    ->inode_lock		(page_remove_rmap->set_page_dirty)
+ *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
+ *
+ *  ->task->proc_lock
+ *    ->dcache_lock		(proc_pid_lookup)
+ */
+
+/*
+ * Take a page out of its mapping's radix tree and page cache accounting.
+ * Caller has to make sure the page is locked and that nobody else uses it -
+ * or that usage is safe - and must hold a write_lock on the tree_lock.
+ */
+void __remove_from_page_cache(struct page *page)
+{
+	struct address_space *as = page->mapping;
+
+	radix_tree_delete(&as->page_tree, page->index);
+	as->nrpages--;
+	page->mapping = NULL;
+	pagecache_acct(-1);
+}
+
+void remove_from_page_cache(struct page *page)
+{
+	struct address_space *as = page->mapping;
+
+	if (unlikely(!PageLocked(page)))
+		PAGE_BUG(page);		/* the page must be locked here */
+
+	write_lock_irq(&as->tree_lock);
+	__remove_from_page_cache(page);
+	write_unlock_irq(&as->tree_lock);
+}
+
+static int sync_page(void *word)
+{
+	struct address_space *as;
+	struct page *page;
+
+	/* 'word' points at page->flags; step back to the enclosing page */
+	page = container_of((page_flags_t *)word, struct page, flags);
+	/*
+	 * FIXME, fercrissake.  What is this barrier here for?
+	 */
+	smp_mb();
+	as = page_mapping(page);
+	if (as && as->a_ops && as->a_ops->sync_page)
+		as->a_ops->sync_page(page);
+	io_schedule();
+	return 0;
+}
+
+/**
+ * __filemap_fdatawrite_range - start writeback against all of a mapping's
+ * dirty pages that lie within the byte offsets <start, end>
+ * @mapping:	address space structure to write
+ * @start:	offset in bytes where the range starts
+ * @end:	offset in bytes where the range ends
+ * @sync_mode:	WB_SYNC_ALL or WB_SYNC_NONE, handed to the writeback control
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback.  The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+static int __filemap_fdatawrite_range(struct address_space *mapping,
+	loff_t start, loff_t end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = mapping->nrpages * 2,
+		.start = start,
+		.end = end,
+	};
+
+	/* bail out when the mapping lacks the writeback-dirty capability */
+	if (!mapping_cap_writeback_dirty(mapping))
+		return 0;
+
+	return do_writepages(mapping, &wbc);
+}
+
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+	int sync_mode)
+{
+	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);	/* start = end = 0 */
+}
+
+int filemap_fdatawrite(struct address_space *mapping)
+{
+	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);	/* data-integrity mode */
+}
+EXPORT_SYMBOL(filemap_fdatawrite);
+
+static int filemap_fdatawrite_range(struct address_space *mapping,
+	loff_t start, loff_t end)
+{
+	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);	/* data-integrity mode */
+}
+
+/*
+ * A mostly non-blocking (WB_SYNC_NONE) flush.  Not suitable for data-integrity
+ * purposes - I/O may not be started against all dirty pages.
+ */
+int filemap_flush(struct address_space *mapping)
+{
+	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL(filemap_flush);
+
+/*
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive.  Returns 0, or -EIO / -ENOSPC if a write error was seen.
+ */
+static int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	int found;
+	int err = 0;
+	pgoff_t index;
+
+	if (end < start)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	index = start;
+	/* each pass asks for at most PAGEVEC_SIZE writeback-tagged pages */
+	while ((index <= end) &&
+			(found = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < found; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* the lookup takes no end index, so filter here */
+			if (page->index <= end) {
+				wait_on_page_writeback(page);
+				if (PageError(page))
+					err = -EIO;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/* Pick up errors flagged on the mapping; -EIO wins over -ENOSPC */
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		err = -EIO;
+
+	return err;
+}
+
+/*
+ * Write out and wait upon all the pages covering 'count' bytes at 'pos'.
+ * This is a "data integrity" operation: writeout is started, the inode is
+ * osync'ed, and only then do we wait.  The first error seen is returned.
+ *
+ * We need to re-take i_sem during the generic_osync_inode list walk because
+ * it is otherwise livelockable.
+ */
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count)
+{
+	const pgoff_t first = pos >> PAGE_CACHE_SHIFT;
+	const pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (!mapping_cap_writeback_dirty(mapping) || !count)
+		return 0;
+	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+	if (err)
+		return err;
+	down(&inode->i_sem);
+	err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	up(&inode->i_sem);
+	if (err)
+		return err;
+	return wait_on_page_writeback_range(mapping, first, last);
+}
+EXPORT_SYMBOL(sync_page_range);
+
+/*
+ * Same as sync_page_range(), but i_sem is not taken here.
+ * Note: Holding i_sem across this call is not a good idea, as it forces
+ * O_SYNC writers to different parts of the same file to be serialised.
+ */
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count)
+{
+	const pgoff_t first = pos >> PAGE_CACHE_SHIFT;
+	const pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (!mapping_cap_writeback_dirty(mapping) || !count)
+		return 0;
+	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+	if (!err)
+		err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	if (err)
+		return err;
+	return wait_on_page_writeback_range(mapping, first, last);
+}
+EXPORT_SYMBOL(sync_page_range_nolock);
+
+/**
+ * filemap_fdatawait - wait for all under-writeback pages of an address space
+ * @mapping: address space structure to wait for
+ *
+ * Only pages up to the current i_size of the mapping's host are waited on.
+ */
+int filemap_fdatawait(struct address_space *mapping)
+{
+	const loff_t size = i_size_read(mapping->host);
+	pgoff_t last;
+
+	if (size == 0)
+		return 0;
+	last = (size - 1) >> PAGE_CACHE_SHIFT;
+	return wait_on_page_writeback_range(mapping, 0, last);
+}
+EXPORT_SYMBOL(filemap_fdatawait);
+
+int filemap_write_and_wait(struct address_space *mapping)
+{
+	int retval;
+
+	if (!mapping->nrpages)	/* no pages in the mapping: nothing to do */
+		return 0;
+	retval = filemap_fdatawrite(mapping);
+	if (retval)
+		return retval;
+	return filemap_fdatawait(mapping);
+}
+
+int filemap_write_and_wait_range(struct address_space *mapping,
+				 loff_t lstart, loff_t lend)
+{
+	const pgoff_t first = lstart >> PAGE_CACHE_SHIFT;
+	const pgoff_t last = lend >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (!mapping->nrpages)
+		return 0;
+	/* write the byte range out, then wait on the pages covering it */
+	err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL);
+	if (err)
+		return err;
+	return wait_on_page_writeback_range(mapping, first, last);
+}
+
+/*
+ * Insert a newly allocated page into a mapping's page cache at 'offset'.
+ * Because the page is new, SetPageLocked() can be run against it directly;
+ * the other page state flags were set by rmqueue().  Returns 0 on success
+ * or the error from radix_tree_preload()/radix_tree_insert().
+ * The page is not put on the LRU - the caller must do that.
+ */
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset, int gfp_mask)
+{
+	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+	if (error)
+		return error;
+	write_lock_irq(&mapping->tree_lock);
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+	if (error == 0) {
+		page_cache_get(page);
+		SetPageLocked(page);
+		page->mapping = mapping;
+		page->index = offset;
+		mapping->nrpages++;
+		pagecache_acct(1);
+	}
+	write_unlock_irq(&mapping->tree_lock);
+	radix_tree_preload_end();
+	return error;
+}
+
+EXPORT_SYMBOL(add_to_page_cache);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+				pgoff_t offset, int gfp_mask)
+{
+	int err = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (!err)
+		lru_cache_add(page);	/* only after a successful insert */
+	return err;
+}
+
+/*
+ * Pages have no waitqueue of their own.  Waiters for a page sleep on a
+ * bucket of the zone's hashed wait table instead; every waiter on a
+ * bucket is woken when any of its pages becomes available, and each
+ * woken context checks that the page it wants is the one that did.
+ * This saves space at a cost of "thundering herd" phenomena during
+ * rare hash collisions.
+ * Returns the wait queue head that 'page' hashes to.
+ */
+static wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	const struct zone *zone = page_zone(page);
+	unsigned long bucket = hash_ptr(page, zone->wait_table_bits);
+
+	return &zone->wait_table[bucket];
+}
+
+static inline void wake_up_page(struct page *page, int bit)
+{
+	__wake_up_bit(page_waitqueue(page), &page->flags, bit);	/* waiters on 'bit' of page->flags */
+}
+
+void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+	if (test_bit(bit_nr, &page->flags))	/* only wait if the bit is set */
+		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+							TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+/**
+ * unlock_page() - unlock a locked page
+ *
+ * @page: the page
+ *
+ * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Also wakes sleepers in wait_on_page_writeback() because the wakeup
+ * mechanism between PageLocked pages and PageWriteback pages is shared.
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ *
+ * The first mb is necessary to safely close the critical section opened by the
+ * TestSetPageLocked(), the second mb is necessary to enforce ordering between
+ * the clear_bit and the read of the waitqueue (to avoid SMP races with a
+ * parallel wait_on_page_locked()).
+ */
+void fastcall unlock_page(struct page *page)
+{
+	smp_mb__before_clear_bit();
+	if (!TestClearPageLocked(page))
+		BUG();	/* the page was not locked */
+	smp_mb__after_clear_bit(); 
+	wake_up_page(page, PG_locked);
+}
+EXPORT_SYMBOL(unlock_page);
+
+/*
+ * End writeback against a page and wake up anybody waiting on PG_writeback.
+ */
+void end_page_writeback(struct page *page)
+{
+	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
+		if (!test_clear_page_writeback(page))
+			BUG();	/* the page was not under writeback */
+	}
+	smp_mb__after_clear_bit();
+	wake_up_page(page, PG_writeback);
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+/*
+ * Get a lock on the page, assuming we need to sleep to get it.  The wait is
+ * TASK_UNINTERRUPTIBLE, with sync_page() passed as the wait action.
+ * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
+ * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
+ * chances are that on the second loop, the block layer's plug list is empty,
+ * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
+ */
+void fastcall __lock_page(struct page *page)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+							TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__lock_page);
+
+/*
+ * Look a page up in the mapping's radix tree and take a reference on it,
+ * both under the tree_lock.  Returns NULL if no page is cached at 'offset'.
+ */
+struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+{
+	struct page *found;
+
+	read_lock_irq(&mapping->tree_lock);
+	found = radix_tree_lookup(&mapping->page_tree, offset);
+	if (found != NULL)
+		page_cache_get(found);
+	read_unlock_irq(&mapping->tree_lock);
+	return found;
+}
+
+EXPORT_SYMBOL(find_get_page);
+
+/*
+ * Like find_get_page(), but trylock the page instead of taking a reference.
+ */
+struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
+{
+	struct page *found;
+
+	read_lock_irq(&mapping->tree_lock);
+	found = radix_tree_lookup(&mapping->page_tree, offset);
+	if (found != NULL && TestSetPageLocked(found))
+		found = NULL;	/* PG_locked was already set */
+	read_unlock_irq(&mapping->tree_lock);
+	return found;
+}
+
+EXPORT_SYMBOL(find_trylock_page);
+
+/**
+ * find_lock_page - locate, pin and lock a pagecache page
+ *
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Locates the desired pagecache page, locks it, increments its reference
+ * count and returns its address.
+ *
+ * Returns NULL if the page was not present. find_lock_page() may sleep.
+ */
+struct page *find_lock_page(struct address_space *mapping,
+				unsigned long offset)
+{
+	struct page *page;
+
+	read_lock_irq(&mapping->tree_lock);
+repeat:
+	page = radix_tree_lookup(&mapping->page_tree, offset);
+	if (page) {
+		page_cache_get(page);
+		if (TestSetPageLocked(page)) {
+			read_unlock_irq(&mapping->tree_lock);
+			lock_page(page);
+			read_lock_irq(&mapping->tree_lock);
+
+			/* Recheck: was the page truncated while we slept? */
+			if (page->mapping != mapping || page->index != offset) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto repeat;
+			}
+		}
+	}
+	read_unlock_irq(&mapping->tree_lock);
+	return page;
+}
+
+EXPORT_SYMBOL(find_lock_page);
+
+/**
+ * find_or_create_page - locate or add a pagecache page
+ *
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
+ *
+ * Locates a page in the pagecache.  If the page is not present, a new page
+ * is allocated using @gfp_mask and is added to the pagecache and to the VM's
+ * LRU list.  The returned page is locked and has its reference count
+ * incremented.
+ *
+ * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
+ * allocation!
+ *
+ * find_or_create_page() returns the desired page's address, or NULL if a
+ * page could not be allocated or added to the pagecache.
+ */
+struct page *find_or_create_page(struct address_space *mapping,
+		unsigned long index, unsigned int gfp_mask)
+{
+	struct page *page, *cached_page = NULL;
+	int err;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		if (!cached_page) {
+			cached_page = alloc_page(gfp_mask);
+			if (!cached_page)
+				return NULL;
+		}
+		err = add_to_page_cache_lru(cached_page, mapping,
+					index, gfp_mask);
+		if (!err) {
+			page = cached_page;
+			cached_page = NULL;
+		} else if (err == -EEXIST)
+			goto repeat;	/* index already populated: look it up again */
+	}
+	if (cached_page)
+		page_cache_release(cached_page);
+	return page;
+}
+
+EXPORT_SYMBOL(find_or_create_page);
+
+/**
+ * find_get_pages - gang pagecache lookup
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ * @pages:	Where the resulting pages are placed
+ *
+ * find_get_pages() looks up at most @nr_pages pages of @mapping, starting
+ * the search at index @start, and stores them in @pages.  A reference is
+ * taken on every page returned.
+ *
+ * The pages come back in ascending index order.  The indices need not be
+ * consecutive: not-present pages leave holes.
+ *
+ * find_get_pages() returns the number of pages which were found.
+ */
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+			    unsigned int nr_pages, struct page **pages)
+{
+	unsigned int nr_found;
+	struct page **pp;
+
+	read_lock_irq(&mapping->tree_lock);
+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, start, nr_pages);
+	for (pp = pages; pp < pages + nr_found; pp++)
+		page_cache_get(*pp);	/* pin each page under tree_lock */
+	read_unlock_irq(&mapping->tree_lock);
+	return nr_found;
+}
+
+/*
+ * Tagged variant of find_get_pages(): only pages carrying `tag' are
+ * returned.  On a hit, *index is advanced past the last page returned.
+ */
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+			int tag, unsigned int nr_pages, struct page **pages)
+{
+	unsigned int nr_found;
+	unsigned int n;
+
+	read_lock_irq(&mapping->tree_lock);
+	nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree,
+				(void **)pages, *index, nr_pages, tag);
+	for (n = nr_found; n > 0; n--)
+		page_cache_get(pages[n - 1]);
+	if (nr_found != 0)	/* resume just past the last page found */
+		*index = pages[nr_found - 1]->index + 1;
+	read_unlock_irq(&mapping->tree_lock);
+	return nr_found;
+}
+
+/*
+ * Non-blocking flavour of grab_cache_page: NULL is returned rather than
+ * waiting for a page that is unavailable.  Meant for speculative data
+ * generators which can regenerate the data when the page could not be
+ * grabbed.  Should be safe to call while holding another page's lock.
+ *
+ * __GFP_FS is cleared for the allocation so that we do not recurse into
+ * the fs and deadlock against the caller's locked page.
+ */
+struct page *
+grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+{
+	unsigned int gfp_mask;
+	struct page *page = find_get_page(mapping, index);
+
+	if (page != NULL) {
+		if (TestSetPageLocked(page)) {	/* busy: give up, don't wait */
+			page_cache_release(page);
+			page = NULL;
+		}
+		return page;
+	}
+	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
+	page = alloc_pages(gfp_mask, 0);
+	if (!page || !add_to_page_cache_lru(page, mapping, index, gfp_mask))
+		return page;
+	page_cache_release(page);	/* insertion failed: drop our page */
+	return NULL;
+}
+
+EXPORT_SYMBOL(grab_cache_page_nowait);
+
+/*
+ * This is a generic file read routine, and uses the
+ * mapping->a_ops->readpage() function for the actual low-level
+ * stuff.  Pages are fed to actor() one at a time until desc->count hits
+ * zero, actor() takes less than offered, EOF or an error; *ppos follows.
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ *
+ * Note the struct file* is handed to readahead and ->readpage, and used for
+ * file_accessed() when non-NULL.  It may be NULL.
+ */
+void do_generic_mapping_read(struct address_space *mapping,
+			     struct file_ra_state *_ra,
+			     struct file *filp,
+			     loff_t *ppos,
+			     read_descriptor_t *desc,
+			     read_actor_t actor)
+{
+	struct inode *inode = mapping->host;
+	unsigned long index;
+	unsigned long end_index;
+	unsigned long offset;
+	unsigned long last_index;
+	unsigned long next_index;
+	unsigned long prev_index;
+	loff_t isize;
+	struct page *cached_page;
+	int error;
+	struct file_ra_state ra = *_ra;	/* private copy, stored back at out: */
+
+	cached_page = NULL;
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	next_index = index;
+	prev_index = ra.prev_page;
+	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;	/* byte offset inside the page */
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;	/* last page holding data */
+	for (;;) {
+		struct page *page;
+		unsigned long nr, ret;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = PAGE_CACHE_SIZE;
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (nr <= offset) {
+				goto out;
+			}
+		}
+		nr = nr - offset;
+
+		cond_resched();
+		if (index == next_index)
+			next_index = page_cache_readahead(mapping, &ra, filp,
+					index, last_index - index);
+
+find_page:
+		page = find_get_page(mapping, index);
+		if (unlikely(page == NULL)) {
+			handle_ra_miss(mapping, &ra, index);
+			goto no_cached_page;
+		}
+		if (!PageUptodate(page))
+			goto page_not_up_to_date;
+page_ok:
+
+		/* If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		/*
+		 * When (part of) the same page is read multiple times
+		 * in succession, only mark it as accessed the first time.
+		 */
+		if (prev_index != index)
+			mark_page_accessed(page);
+		prev_index = index;
+
+		/*
+		 * Ok, we have the page, and it's up-to-date, so
+		 * now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		ret = actor(desc, page, offset, nr);
+		offset += ret;
+		index += offset >> PAGE_CACHE_SHIFT;	/* next page once this one is used up */
+		offset &= ~PAGE_CACHE_MASK;
+
+		page_cache_release(page);
+		if (ret == nr && desc->count)	/* all offered bytes taken, more wanted */
+			continue;
+		goto out;
+
+page_not_up_to_date:
+		/* Get exclusive access to the page ... */
+		lock_page(page);
+
+		/* Did it get unhashed before we got the lock? */
+		if (!page->mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			continue;
+		}
+
+		/* Did somebody else fill it already? */
+		if (PageUptodate(page)) {
+			unlock_page(page);
+			goto page_ok;
+		}
+
+readpage:
+		/* Start the actual read. The read will unlock the page. */
+		error = mapping->a_ops->readpage(filp, page);
+
+		if (unlikely(error))
+			goto readpage_error;
+
+		if (!PageUptodate(page)) {
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				if (page->mapping == NULL) {
+					/*
+					 * invalidate_inode_pages got it
+					 */
+					unlock_page(page);
+					page_cache_release(page);
+					goto find_page;
+				}
+				unlock_page(page);
+				error = -EIO;
+				goto readpage_error;
+			}
+			unlock_page(page);
+		}
+
+		/*
+		 * i_size must be checked after we have done ->readpage.
+		 *
+		 * Checking i_size after the readpage allows us to calculate
+		 * the correct value for "nr", which means the zero-filled
+		 * part of the page is not copied back to userspace (unless
+		 * another truncate extends the file - this is desired though).
+		 */
+		isize = i_size_read(inode);
+		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		if (unlikely(!isize || index > end_index)) {
+			page_cache_release(page);
+			goto out;
+		}
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = PAGE_CACHE_SIZE;
+		if (index == end_index) {
+			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (nr <= offset) {
+				page_cache_release(page);
+				goto out;
+			}
+		}
+		nr = nr - offset;
+		goto page_ok;
+
+readpage_error:
+		/* UHHUH! A synchronous read error occurred. Report it */
+		desc->error = error;
+		page_cache_release(page);
+		goto out;
+
+no_cached_page:
+		/*
+		 * Ok, it wasn't cached, so we need to create a new
+		 * page..
+		 */
+		if (!cached_page) {
+			cached_page = page_cache_alloc_cold(mapping);
+			if (!cached_page) {
+				desc->error = -ENOMEM;
+				goto out;
+			}
+		}
+		error = add_to_page_cache_lru(cached_page, mapping,
+						index, GFP_KERNEL);
+		if (error) {
+			if (error == -EEXIST)	/* someone else inserted a page first */
+				goto find_page;
+			desc->error = error;
+			goto out;
+		}
+		page = cached_page;
+		cached_page = NULL;
+		goto readpage;
+	}
+
+out:
+	*_ra = ra;	/* hand the readahead state back to the caller */
+
+	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	if (cached_page)	/* spare page that was never inserted */
+		page_cache_release(cached_page);
+	if (filp)
+		file_accessed(filp);
+}
+
+EXPORT_SYMBOL(do_generic_mapping_read);
+
+/* Read actor: copy up to @size bytes of @page from @offset to desc->arg.buf. */
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+			unsigned long offset, unsigned long size)
+{
+	unsigned long remaining = desc->count;
+	unsigned long uncopied = 1;	/* non-zero until a copy completes */
+	char *vaddr;
+
+	if (size > remaining)
+		size = remaining;
+
+	/*
+	 * Faults on the destination of a read are common, so touch it before
+	 * taking the atomic kmap.
+	 */
+	if (fault_in_pages_writeable(desc->arg.buf, size) == 0) {
+		vaddr = kmap_atomic(page, KM_USER0);
+		uncopied = __copy_to_user_inatomic(desc->arg.buf,
+						vaddr + offset, size);
+		kunmap_atomic(vaddr, KM_USER0);
+	}
+	if (uncopied != 0) {
+		/* Do it the slow way */
+		vaddr = kmap(page);
+		uncopied = __copy_to_user(desc->arg.buf, vaddr + offset, size);
+		kunmap(page);
+		if (uncopied != 0) {
+			size -= uncopied;
+			desc->error = -EFAULT;
+		}
+	}
+
+	desc->count = remaining - size;
+	desc->written += size;
+	desc->arg.buf += size;
+	return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems that can use the page
+ * cache directly.  Returns bytes read, or a negative errno if none were.
+ */
+ssize_t
+__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *filp = iocb->ki_filp;
+	ssize_t retval;
+	unsigned long seg;
+	size_t count;
+
+	count = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/* A negative segment length, or a cumulative length that
+		 * wraps negative, gets -EINVAL. */
+		count += iv->iov_len;
+		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		nr_segs = seg;		/* only use the segments before this one */
+		count -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+	if (filp->f_flags & O_DIRECT) {
+		loff_t pos = *ppos, size;
+		struct address_space *mapping;
+		struct inode *inode;
+
+		mapping = filp->f_mapping;
+		inode = mapping->host;
+		retval = 0;
+		if (!count)
+			goto out; /* skip atime */
+		size = i_size_read(inode);
+		if (pos < size) {
+			retval = generic_file_direct_IO(READ, iocb,
+						iov, pos, nr_segs);
+			if (retval >= 0 && !is_sync_kiocb(iocb))
+				retval = -EIOCBQUEUED;
+			if (retval > 0)
+				*ppos = pos + retval;
+		}
+		file_accessed(filp);
+		goto out;
+	}
+
+	retval = 0;
+	if (count) {
+		for (seg = 0; seg < nr_segs; seg++) {
+			read_descriptor_t desc;
+
+			desc.written = 0;
+			desc.arg.buf = iov[seg].iov_base;
+			desc.count = iov[seg].iov_len;
+			if (desc.count == 0)
+				continue;
+			desc.error = 0;
+			do_generic_file_read(filp,ppos,&desc,file_read_actor);
+			retval += desc.written;
+			/* Stop at the first error or short read so that later
+			 * segments are never filled past a gap. */
+			if (desc.error || desc.count) {
+				retval = retval ? retval : desc.error;
+				break;
+			}
+		}
+	}
+out:
+	return retval;
+}
+
+EXPORT_SYMBOL(__generic_file_aio_read);
+
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+{
+	struct iovec one_seg = { .iov_len = count, .iov_base = buf };
+	loff_t *ki_posp = &iocb->ki_pos;
+	BUG_ON(*ki_posp != pos);	/* pos must match the kiocb's position */
+	return __generic_file_aio_read(iocb, &one_seg, 1, ki_posp);
+}
+
+EXPORT_SYMBOL(generic_file_aio_read);
+
+ssize_t
+generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct kiocb sync_iocb;
+	struct iovec one_seg = { .iov_len = count, .iov_base = buf };
+	ssize_t nread;
+
+	init_sync_kiocb(&sync_iocb, filp);
+	nread = __generic_file_aio_read(&sync_iocb, &one_seg, 1, ppos);
+	if (nread != -EIOCBQUEUED)
+		return nread;
+	return wait_on_sync_kiocb(&sync_iocb);	/* queued: wait for completion */
+}
+
+EXPORT_SYMBOL(generic_file_read);
+
+int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+{
+	struct file *out = desc->arg.data;
+	unsigned long remaining = desc->count;
+	ssize_t sent;
+
+	if (remaining < size)
+		size = remaining;
+
+	sent = out->f_op->sendpage(out, page, offset, size,
+				   &out->f_pos, size < remaining);
+	if (sent < 0) {		/* failure: record it, report no progress */
+		desc->error = sent;
+		sent = 0;
+	}
+	desc->written += sent;
+	desc->count = remaining - sent;
+	return sent;
+}
+
+ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
+			 size_t count, read_actor_t actor, void *target)
+{
+	read_descriptor_t rd;
+
+	if (count == 0)
+		return 0;
+
+	rd.arg.data = target;	/* handed through to the actor */
+	rd.count = count;
+	rd.written = 0;
+	rd.error = 0;
+	do_generic_file_read(in_file, ppos, &rd, actor);
+	/* Report the error only if nothing at all was transferred. */
+	if (rd.written == 0)
+		return rd.error;
+	return rd.written;
+}
+
+EXPORT_SYMBOL(generic_file_sendfile);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+	     unsigned long index, unsigned long nr)
+{
+	if (mapping && mapping->a_ops && mapping->a_ops->readpage) {
+		nr = max_sane_readahead(nr);
+		force_page_cache_readahead(mapping, filp, index, nr);
+		return 0;
+	}
+	return -EINVAL;	/* no mapping, or no ->readpage to read with */
+}
+
+asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
+{
+	unsigned long first, last;
+	ssize_t ret = -EBADF;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return ret;
+	if (file->f_mode & FMODE_READ) {
+		/* Convert the byte range into an inclusive page range. */
+		first = offset >> PAGE_CACHE_SHIFT;
+		last = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+		ret = do_readahead(file->f_mapping, file, first,
+					last - first + 1);
+	}
+	fput(file);
+	return ret;
+}
+
+#ifdef CONFIG_MMU
+/*
+ * Allocate a page for @offset, insert it into the page cache unless one is
+ * already there, and start an I/O to read its contents via ->readpage.
+ */
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+static int fastcall page_cache_read(struct file * file, unsigned long offset)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct page *newpage;
+	int rc;
+
+	newpage = page_cache_alloc_cold(mapping);
+	if (newpage == NULL)
+		return -ENOMEM;
+
+	rc = add_to_page_cache_lru(newpage, mapping, offset, GFP_KERNEL);
+	switch (rc) {
+	case 0:		/* inserted: kick off the read */
+		rc = mapping->a_ops->readpage(file, newpage);
+		break;
+	case -EEXIST:	/* someone raced with us and added a page first */
+		rc = 0;
+		break;
+	default:	/* e.g. out of memory for radix-tree nodes */
+		break;
+	}
+	/* In every case the reference taken by the allocation is dropped. */
+	page_cache_release(newpage);
+	return rc;
+}
+
+#define MMAP_LOTSAMISS  (100)
+
+/*
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ * Returns the page with a reference held, NULL, or NOPAGE_OOM; *type,
+ * when non-NULL, is set to VM_FAULT_MINOR or VM_FAULT_MAJOR on success.
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ */
+struct page *filemap_nopage(struct vm_area_struct *area,
+				unsigned long address, int *type)
+{
+	int error;
+	struct file *file = area->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct file_ra_state *ra = &file->f_ra;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long size, pgoff;
+	int did_readaround = 0, majmin = VM_FAULT_MINOR;
+
+	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+
+retry_all:
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;	/* file size in pages, rounded up */
+	if (pgoff >= size)
+		goto outside_data_content;
+
+	/* If we don't want any read-ahead, don't bother */
+	if (VM_RandomReadHint(area))
+		goto no_cached_page;
+
+	/*
+	 * The readahead code wants to be told about each and every page
+	 * so it can build and shrink its windows appropriately
+	 *
+	 * For sequential accesses, we use the generic readahead logic.
+	 */
+	if (VM_SequentialReadHint(area))
+		page_cache_readahead(mapping, ra, file, pgoff, 1);
+
+	/*
+	 * Do we have something in the page cache already?
+	 */
+retry_find:
+	page = find_get_page(mapping, pgoff);
+	if (!page) {
+		unsigned long ra_pages;
+
+		if (VM_SequentialReadHint(area)) {
+			handle_ra_miss(mapping, ra, pgoff);
+			goto no_cached_page;
+		}
+		ra->mmap_miss++;
+
+		/*
+		 * Do we miss much more than hit in this file? If so,
+		 * stop bothering with read-ahead. It will only hurt.
+		 */
+		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
+			goto no_cached_page;
+
+		/*
+		 * To keep the pgmajfault counter straight, we need to
+		 * check did_readaround, as this is an inner loop.
+		 */
+		if (!did_readaround) {
+			majmin = VM_FAULT_MAJOR;
+			inc_page_state(pgmajfault);
+		}
+		did_readaround = 1;
+		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
+		if (ra_pages) {
+			pgoff_t start = 0;
+
+			if (pgoff > ra_pages / 2)	/* centre the window on pgoff */
+				start = pgoff - ra_pages / 2;
+			do_page_cache_readahead(mapping, file, start, ra_pages);
+		}
+		page = find_get_page(mapping, pgoff);
+		if (!page)
+			goto no_cached_page;
+	}
+
+	if (!did_readaround)
+		ra->mmap_hit++;
+
+	/*
+	 * Ok, found a page in the page cache, now we need to check
+	 * that it's up-to-date.
+	 */
+	if (!PageUptodate(page))
+		goto page_not_uptodate;
+
+success:
+	/*
+	 * Found the page and have a reference on it.
+	 */
+	mark_page_accessed(page);
+	if (type)
+		*type = majmin;
+	return page;
+
+outside_data_content:
+	/*
+	 * An external ptracer can access pages that normally aren't
+	 * accessible..
+	 */
+	if (area->vm_mm == current->mm)
+		return NULL;
+	/* Fall through to the non-read-ahead case */
+no_cached_page:
+	/*
+	 * Reached with MADV_RANDOM in effect, or when readahead was skipped
+	 * or did not leave the page in the cache: read just this page.
+	 */
+	error = page_cache_read(file, pgoff);
+	grab_swap_token();
+
+	/*
+	 * The page we want has now been added to the page cache.
+	 * In the unlikely event that someone removed it in the
+	 * meantime, we'll just come back here and read it again.
+	 */
+	if (error >= 0)
+		goto retry_find;
+
+	/*
+	 * An error return from page_cache_read can result if the
+	 * system is low on memory, or a problem occurs while trying
+	 * to schedule I/O.
+	 */
+	if (error == -ENOMEM)
+		return NOPAGE_OOM;
+	return NULL;
+
+page_not_uptodate:
+	if (!did_readaround) {
+		majmin = VM_FAULT_MAJOR;
+		inc_page_state(pgmajfault);
+	}
+	lock_page(page);
+
+	/* Did it get unhashed while we waited for it? */
+	if (!page->mapping) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_all;
+	}
+
+	/* Did somebody else get it up-to-date? */
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		goto success;
+	}
+
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page_locked(page);
+		if (PageUptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Umm, take care of errors if the page isn't up-to-date.
+	 * Try to re-read it _once_. We do this synchronously,
+	 * because there really aren't any performance issues here
+	 * and we need to check for errors.
+	 */
+	lock_page(page);
+
+	/* Somebody truncated the page on us? */
+	if (!page->mapping) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_all;
+	}
+
+	/* Somebody else successfully read it in? */
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		goto success;
+	}
+	ClearPageError(page);
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page_locked(page);
+		if (PageUptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Things didn't work out. Return NULL to tell the
+	 * mm layer so, possibly freeing the page cache page first.
+	 */
+	page_cache_release(page);
+	return NULL;
+}
+
+EXPORT_SYMBOL(filemap_nopage);
+
+static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
+					int nonblock)	/* nonblock: return NULL on a cache miss instead of reading */
+{
+	struct address_space *mapping = file->f_mapping;
+	struct page *page;
+	int error;
+
+	/*
+	 * Do we have something in the page cache already?
+	 */
+retry_find:
+	page = find_get_page(mapping, pgoff);
+	if (!page) {
+		if (nonblock)
+			return NULL;
+		goto no_cached_page;
+	}
+
+	/*
+	 * Ok, found a page in the page cache, now we need to check
+	 * that it's up-to-date.
+	 */
+	if (!PageUptodate(page))
+		goto page_not_uptodate;
+
+success:
+	/*
+	 * Found the page and have a reference on it.
+	 */
+	mark_page_accessed(page);
+	return page;
+
+no_cached_page:
+	error = page_cache_read(file, pgoff);
+
+	/*
+	 * The page we want has now been added to the page cache.
+	 * In the unlikely event that someone removed it in the
+	 * meantime, we'll just come back here and read it again.
+	 */
+	if (error >= 0)
+		goto retry_find;
+
+	/*
+	 * An error return from page_cache_read can result if the
+	 * system is low on memory, or a problem occurs while trying
+	 * to schedule I/O.
+	 */
+	return NULL;
+
+page_not_uptodate:
+	lock_page(page);	/* taken even when nonblock is set */
+
+	/* Did it get unhashed while we waited for it? */
+	if (!page->mapping) {
+		unlock_page(page);
+		goto err;
+	}
+
+	/* Did somebody else get it up-to-date? */
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		goto success;
+	}
+
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page_locked(page);
+		if (PageUptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Umm, take care of errors if the page isn't up-to-date.
+	 * Try to re-read it _once_. We do this synchronously,
+	 * because there really aren't any performance issues here
+	 * and we need to check for errors.
+	 */
+	lock_page(page);
+
+	/* Somebody truncated the page on us? */
+	if (!page->mapping) {
+		unlock_page(page);
+		goto err;
+	}
+	/* Somebody else successfully read it in? */
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		goto success;
+	}
+
+	ClearPageError(page);
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page_locked(page);
+		if (PageUptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Things didn't work out. Return NULL to tell the
+	 * caller so, dropping our reference on the page first.
+	 */
+err:
+	page_cache_release(page);
+
+	return NULL;
+}
+
+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long len, pgprot_t prot, unsigned long pgoff,
+		int nonblock)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long size;
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	int err;
+
+	if (!nonblock)	/* blocking mode: start readahead for the whole range */
+		force_page_cache_readahead(mapping, vma->vm_file,
+					pgoff, len >> PAGE_CACHE_SHIFT);
+
+repeat:
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;	/* file size in pages, re-read each pass */
+	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)	/* rest of the range runs past EOF */
+		return -EINVAL;
+
+	page = filemap_getpage(file, pgoff, nonblock);
+	if (!page && !nonblock)
+		return -ENOMEM;
+	if (page) {
+		err = install_page(mm, vma, addr, page, prot);
+		if (err) {
+			page_cache_release(page);
+			return err;
+		}
+	} else {	/* nonblock and no page obtained: install a file pte instead */
+		err = install_file_pte(mm, vma, addr, pgoff, prot);
+		if (err)
+			return err;
+	}
+
+	len -= PAGE_SIZE;	/* advance by one page */
+	addr += PAGE_SIZE;
+	pgoff++;
+	if (len)
+		goto repeat;
+
+	return 0;
+}
+
+struct vm_operations_struct generic_file_vm_ops = {
+	.nopage		= filemap_nopage,	/* reads file data in on a page fault */
+	.populate	= filemap_populate,	/* installs pages or file ptes for a range */
+};
+
+/* General-purpose mmap of a disk file; the mapping must have ->readpage */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	/* filemap_nopage() reads pages in through ->readpage. */
+	if (file->f_mapping->a_ops->readpage == NULL)
+		return -ENOEXEC;
+
+	file_accessed(file);
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(filemap_populate);
+
+/*
+ * mmap for filesystems without ->writepage: refuse shared, writable maps.
+ */
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!(vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_MAYWRITE))
+		return generic_file_mmap(file, vma);
+	return -EINVAL;	/* shared and may-write: not supported here */
+}
+#else
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	return -ENOSYS;	/* built without CONFIG_MMU: always fails */
+}
+int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	return -ENOSYS;	/* built without CONFIG_MMU: always fails */
+}
+#endif /* CONFIG_MMU */
+
+EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap);
+
+static inline struct page *__read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *spare = NULL;
+	struct page *page;
+	int err;
+
+	/* Fast path: already cached.  Retry while we lose insertion races. */
+	while ((page = find_get_page(mapping, index)) == NULL) {
+		if (spare == NULL) {
+			spare = page_cache_alloc_cold(mapping);
+			if (spare == NULL)
+				return ERR_PTR(-ENOMEM);
+		}
+		err = add_to_page_cache_lru(spare, mapping, index, GFP_KERNEL);
+		if (err == -EEXIST)
+			continue;
+		if (err < 0) {
+			/* Presumably ENOMEM for radix tree node */
+			page_cache_release(spare);
+			return ERR_PTR(err);
+		}
+		/* Our page went in: call the filler on it. */
+		err = filler(data, spare);
+		if (err < 0) {
+			page_cache_release(spare);
+			return ERR_PTR(err);
+		}
+		return spare;
+	}
+	if (spare != NULL)
+		page_cache_release(spare);
+	return page;
+}
+
+/*
+ * Read into the page cache. If a page already exists, and PageUptodate()
+ * is not set, try to fill the page.  Returns the page or an ERR_PTR().
+ */
+struct page *read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+retry:
+	page = __read_cache_page(mapping, index, filler, data);
+	if (IS_ERR(page))
+		goto out;
+	mark_page_accessed(page);
+	if (PageUptodate(page))
+		goto out;
+
+	lock_page(page);
+	if (!page->mapping) {	/* no longer in a mapping: start over */
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry;
+	}
+	if (PageUptodate(page)) {	/* became up to date while we waited */
+		unlock_page(page);
+		goto out;
+	}
+	err = filler(data, page);	/* NOTE(review): page is not unlocked here - presumably the filler does it; confirm */
+	if (err < 0) {
+		page_cache_release(page);
+		page = ERR_PTR(err);
+	}
+ out:
+	return page;
+}
+
+EXPORT_SYMBOL(read_cache_page);
+
+/*
+ * Find and lock the page at @index, inserting *@cached_page if none exists.
+ * If the page was newly created, increment its refcount and add it to the
+ * caller's lru-buffering pagevec.  A page left in *@cached_page is not
+ * released here.  This is specifically for generic_file_write().
+ */
+static inline struct page *
+__grab_cache_page(struct address_space *mapping, unsigned long index,
+			struct page **cached_page, struct pagevec *lru_pvec)
+{
+	int err;
+	struct page *page;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		if (!*cached_page) {
+			*cached_page = page_cache_alloc(mapping);
+			if (!*cached_page)
+				return NULL;
+		}
+		err = add_to_page_cache(*cached_page, mapping,
+					index, GFP_KERNEL);
+		if (err == -EEXIST)	/* lost an insertion race: look it up again */
+			goto repeat;
+		if (err == 0) {
+			page = *cached_page;
+			page_cache_get(page);
+			if (!pagevec_add(lru_pvec, page))
+				__pagevec_lru_add(lru_pvec);
+			*cached_page = NULL;	/* consumed by the page cache */
+		}
+	}
+	return page;	/* NULL if the insertion failed for another reason */
+}
+
+/*
+ * Strip set-id bits from the inode behind @dentry:
+ *
+ *	suid is always dropped; sgid only when group-exec is also set.
+ *	Nothing is changed for a caller holding CAP_FSETID.
+ */
+int remove_suid(struct dentry *dentry)
+{
+	const mode_t sgid_exec = S_ISGID | S_IXGRP;
+	mode_t mode = dentry->d_inode->i_mode;
+	struct iattr newattrs;
+	int kill;
+
+	/* suid always must be killed */
+	kill = unlikely(mode & S_ISUID) ? ATTR_KILL_SUID : 0;
+
+	/*
+	 * sgid only counts together with group-exec; without it, sgid is
+	 * just a mandatory locking mark and is left alone.
+	 */
+	if (unlikely((mode & sgid_exec) == sgid_exec))
+		kill |= ATTR_KILL_SGID;
+
+	/*
+	 * Done if there is nothing to strip or the caller has CAP_FSETID.
+	 */
+	if (likely(!kill || capable(CAP_FSETID)))
+		return 0;
+	newattrs.ia_valid = ATTR_FORCE | kill;
+	return notify_change(dentry, &newattrs);
+}
+EXPORT_SYMBOL(remove_suid);
+
+/*
+ * Copy as much as we can into the page, atomic kmap first and kmap() as the
+ * fallback, and return the number of bytes which were successfully copied.
+ * If a fault is encountered the page is cleared out to (offset+bytes).
+ */
+static inline size_t
+filemap_copy_from_user(struct page *page, unsigned long offset,
+			const char __user *buf, unsigned bytes)
+{
+	int uncopied;
+	char *vaddr = kmap_atomic(page, KM_USER0);
+
+	/* Quick attempt while the atomic kmap is held. */
+	uncopied = __copy_from_user_inatomic(vaddr + offset, buf, bytes);
+	kunmap_atomic(vaddr, KM_USER0);
+	if (uncopied == 0)
+		return bytes;
+
+	/* Do it the slow way */
+	vaddr = kmap(page);
+	uncopied = __copy_from_user(vaddr + offset, buf, bytes);
+	kunmap(page);
+	return bytes - uncopied;
+}
+
+static size_t
+__filemap_copy_from_user_iovec(char *vaddr, 
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);	/* what this segment can supply */
+
+		base = 0;	/* only the first segment starts part-way in */
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left)) {
+			/* zero the rest of the target like __copy_from_user */
+			if (bytes)
+				memset(vaddr, 0, bytes);
+			break;
+		}
+	}
+	return copied - left;	/* bytes of the last segment that faulted don't count */
+}
+
+/*
+ * Multi-segment counterpart of filemap_copy_from_user(), with the same
+ * side effects and return value.  On a fault the remainder of the page
+ * (out to offset+bytes) gets memset by the helper, to emulate
+ * filemap_copy_from_user()'s single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t done;
+	char *vaddr = kmap_atomic(page, KM_USER0);
+
+	done = __filemap_copy_from_user_iovec(vaddr + offset, iov, base, bytes);
+	kunmap_atomic(vaddr, KM_USER0);
+	if (done == bytes)
+		return done;
+
+	/* Came up short under the atomic kmap: redo the copy with kmap(). */
+	vaddr = kmap(page);
+	done = __filemap_copy_from_user_iovec(vaddr + offset, iov, base, bytes);
+	kunmap(page);
+
+	return done;
+}
+
+static inline void
+filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+	const struct iovec *seg = *iovp;
+	size_t off = *basep;
+
+	while (bytes != 0) {	/* walk forward until @bytes are consumed */
+		int step = min(bytes, seg->iov_len - off);
+
+		off += step;
+		bytes -= step;
+		if (off == seg->iov_len) {	/* segment used up */
+			seg++;
+			off = 0;
+		}
+	}
+	*basep = off;
+	*iovp = seg;
+}
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust the writing position (*pos) or the number of bytes (*count).
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+{
+	struct inode *inode = file->f_mapping->host;
+	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+
+        if (unlikely(*pos < 0))
+                return -EINVAL;
+
+        if (unlikely(file->f_error)) {	/* report a pending f_error once, then clear it */
+                int err = file->f_error;
+                file->f_error = 0;
+                return err;
+        }
+
+	if (!isblk) {
+		/* FIXME: this is for backwards compatibility with 2.4 */
+		if (file->f_flags & O_APPEND)
+                        *pos = i_size_read(inode);
+
+		if (limit != RLIM_INFINITY) {
+			if (*pos >= limit) {
+				send_sig(SIGXFSZ, current, 0);
+				return -EFBIG;
+			}
+			if (*count > limit - (typeof(limit))*pos) {	/* trim to the RLIMIT_FSIZE limit */
+				*count = limit - (typeof(limit))*pos;
+			}
+		}
+	}
+
+	/*
+	 * LFS rule
+	 */
+	if (unlikely(*pos + *count > MAX_NON_LFS &&
+				!(file->f_flags & O_LARGEFILE))) {
+		if (*pos >= MAX_NON_LFS) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
+			*count = MAX_NON_LFS - (unsigned long)*pos;
+		}
+	}
+
+	/*
+	 * Are we about to exceed the fs block limit ?
+	 *
+	 * If we have written data it becomes a short write.  If we have
+	 * exceeded without writing data we send a signal and return EFBIG.
+	 * Linus frestrict idea will clean these up nicely..
+	 */
+	if (likely(!isblk)) {
+		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
+			if (*count || *pos > inode->i_sb->s_maxbytes) {
+				send_sig(SIGXFSZ, current, 0);
+				return -EFBIG;
+			}
+			/* zero-length writes at ->s_maxbytes are OK */
+		}
+
+		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
+			*count = inode->i_sb->s_maxbytes - *pos;
+	} else {	/* isblk: bounded by the inode size, no signal is sent */
+		loff_t isize;
+		if (bdev_read_only(I_BDEV(inode)))
+			return -EPERM;
+		isize = i_size_read(inode);
+		if (*pos >= isize) {
+			if (*count || *pos > isize)
+				return -ENOSPC;
+		}
+
+		if (*pos + *count > isize)
+			*count = isize - *pos;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+/*
+ * Attempt an O_DIRECT write of @count bytes at @pos.
+ *
+ * @iocb:    kiocb for the request; iocb->ki_filp is the target file
+ * @iov:     caller's segment array (shortened in place when count != ocount)
+ * @nr_segs: in/out number of segments; updated if the iovec is shortened
+ * @pos:     file offset to write at
+ * @ppos:    when bytes were written, set to the offset just past them
+ * @count:   byte count to write (after the caller's limit checks)
+ * @ocount:  byte count originally described by @iov
+ *
+ * Returns the result of generic_file_direct_IO(), except that a complete
+ * write (written == count) on a kiocb that is not synchronous is reported
+ * as -EIOCBQUEUED.
+ */
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, size_t ocount)
+{
+	struct file	*file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode	*inode = mapping->host;
+	ssize_t		written;
+
+	/* NOTE(review): casts away const - the caller's iovec is modified */
+	if (count != ocount)
+		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	if (written > 0) {
+		loff_t end = pos + written;
+		/* the write extended a non-block-device file: grow i_size */
+		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+			i_size_write(inode,  end);
+			mark_inode_dirty(inode);
+		}
+		*ppos = end;
+	}
+
+	/*
+	 * Sync the fs metadata but not the minor inode changes and
+	 * of course not the data as we did direct DMA for the IO.
+	 * i_sem is held, which protects generic_osync_inode() from
+	 * livelocking.
+	 */
+	if (written >= 0 && file->f_flags & O_SYNC)
+		generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	if (written == count && !is_sync_kiocb(iocb))
+		written = -EIOCBQUEUED;
+	return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+/*
+ * Copy @count bytes of user data described by @iov into the page cache,
+ * starting at file offset @pos.
+ *
+ * @iocb:    kiocb for the request; iocb->ki_filp is the target file
+ * @iov:     user segment array
+ * @nr_segs: number of segments in @iov
+ * @pos:     file offset of the first byte to write
+ * @ppos:    set to the offset reached when the loop finishes
+ * @count:   bytes still to be written
+ * @written: bytes of @iov already consumed by the caller (a partial
+ *           direct-IO write); the copy starts that far into @iov and the
+ *           value is included in the return value
+ *
+ * Returns the total number of bytes written if that is non-zero,
+ * otherwise the last status (0 or a negative error code).
+ */
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space * mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	struct page	*page;
+	struct page	*cached_page = NULL;
+	size_t		bytes;
+	struct pagevec	lru_pvec;
+	const struct iovec *cur_iov = iov; /* current iovec */
+	size_t		iov_base = 0;	   /* offset in the current iovec */
+	char __user	*buf;
+
+	pagevec_init(&lru_pvec, 0);
+
+	/*
+	 * handle partial DIO write.  Adjust cur_iov if needed.
+	 *
+	 * In the multi-segment case the user address must be taken from
+	 * the segment we actually advanced to (cur_iov), not from the
+	 * first segment of the array.
+	 */
+	if (likely(nr_segs == 1))
+		buf = iov->iov_base + written;
+	else {
+		filemap_set_next_iovec(&cur_iov, &iov_base, written);
+		buf = cur_iov->iov_base + iov_base;
+	}
+
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 */
+		fault_in_pages_readable(buf, bytes);
+
+		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+		if (!page) {
+			status = -ENOMEM;
+			break;
+		}
+
+		status = a_ops->prepare_write(file, page, offset, offset+bytes);
+		if (unlikely(status)) {
+			loff_t isize = i_size_read(inode);
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size.  Trim these off again.
+			 */
+			unlock_page(page);
+			page_cache_release(page);
+			if (pos + bytes > isize)
+				vmtruncate(inode, isize);
+			break;
+		}
+		if (likely(nr_segs == 1))
+			copied = filemap_copy_from_user(page, offset,
+							buf, bytes);
+		else
+			copied = filemap_copy_from_user_iovec(page, offset,
+						cur_iov, iov_base, bytes);
+		flush_dcache_page(page);
+		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (likely(copied > 0)) {
+			if (!status)
+				status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				if (unlikely(nr_segs > 1)) {
+					filemap_set_next_iovec(&cur_iov,
+							&iov_base, status);
+					/*
+					 * Segments need not be contiguous
+					 * in user memory, so re-derive buf
+					 * from the current segment.  When
+					 * count is 0 cur_iov may point one
+					 * past the array: don't touch it.
+					 */
+					if (count)
+						buf = cur_iov->iov_base +
+							iov_base;
+				} else {
+					buf += status;
+				}
+			}
+		}
+		/* a short copy from user space with no other error: -EFAULT */
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		if (status < 0)
+			break;
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (count);
+	*ppos = pos;
+
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	/*
+	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
+	 */
+	if (likely(status >= 0)) {
+		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+				status = generic_osync_inode(inode, mapping,
+						OSYNC_METADATA|OSYNC_DATA);
+		}
+	}
+
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size).  So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (unlikely(file->f_flags & O_DIRECT) && written)
+		status = filemap_write_and_wait(mapping);
+
+	pagevec_lru_add(&lru_pvec);
+	return written ? written : status;
+}
+EXPORT_SYMBOL(generic_file_buffered_write);
+
+/*
+ * Common write path behind the generic_file_*write* entry points in this
+ * file.
+ * NOTE(review): the callers visible here that need it take inode->i_sem
+ * around this call; presumably the semaphore must be held - confirm.
+ *
+ * Validates the iovec, waits via vfs_check_frozen(), applies
+ * generic_write_checks(), calls remove_suid() and inode_update_time(),
+ * then writes through generic_file_direct_write() for O_DIRECT files
+ * and/or generic_file_buffered_write().
+ *
+ * @iocb:    kiocb for the request; iocb->ki_filp is the target file
+ * @iov:     user segment array
+ * @nr_segs: number of segments in @iov
+ * @ppos:    in/out file position
+ *
+ * Returns the number of bytes written when that is non-zero, otherwise
+ * the error code from the checks (or 0).
+ */
+ssize_t
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space * mapping = file->f_mapping;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	struct inode 	*inode = mapping->host;
+	unsigned long	seg;
+	loff_t		pos;
+	ssize_t		written;
+	ssize_t		err;
+
+	ocount = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		ocount += iv->iov_len;
+		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+			continue;
+		/*
+		 * An inaccessible first segment fails the whole call; a
+		 * later one just truncates the request to the good prefix.
+		 */
+		if (seg == 0)
+			return -EFAULT;
+		nr_segs = seg;
+		ocount -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	count = ocount;
+	pos = *ppos;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+	written = 0;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	err = remove_suid(file->f_dentry);
+	if (err)
+		goto out;
+
+	inode_update_time(inode, 1);
+
+	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		written = generic_file_direct_write(iocb, iov,
+				&nr_segs, pos, ppos, count, ocount);
+		if (written < 0 || written == count)
+			goto out;
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 * for completing the rest of the request.
+		 */
+		pos += written;
+		count -= written;
+	}
+
+	written = generic_file_buffered_write(iocb, iov, nr_segs,
+			pos, ppos, count, written);
+out:
+	current->backing_dev_info = NULL;
+	return written ? written : err;
+}
+EXPORT_SYMBOL(generic_file_aio_write_nolock);
+
+/*
+ * Run __generic_file_aio_write_nolock() and, when data was written to an
+ * O_SYNC file or IS_SYNC inode, push the written range out with
+ * sync_page_range_nolock().
+ *
+ * Returns the byte count from the write, or the sync error if the sync
+ * step failed.
+ */
+ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *filp = iocb->ki_filp;
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *host = mapping->host;
+	loff_t start = *ppos;	/* position before the write moves *ppos */
+	ssize_t nwritten;
+	int sync_err;
+
+	nwritten = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+	if (nwritten <= 0)
+		return nwritten;
+
+	/* nothing more to do unless synchronous semantics were requested */
+	if (!((filp->f_flags & O_SYNC) || IS_SYNC(host)))
+		return nwritten;
+
+	sync_err = sync_page_range_nolock(host, mapping, start, nwritten);
+	if (sync_err < 0)
+		return sync_err;
+	return nwritten;
+}
+
+/*
+ * Synchronous front end for __generic_file_aio_write_nolock(): builds a
+ * sync kiocb on the stack and waits for it if the write was queued.
+ */
+ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb sync_iocb;
+	ssize_t res;
+
+	init_sync_kiocb(&sync_iocb, file);
+	res = __generic_file_aio_write_nolock(&sync_iocb, iov, nr_segs, ppos);
+	if (res != -EIOCBQUEUED)
+		return res;
+
+	/* the request went asynchronous: block until it completes */
+	return wait_on_sync_kiocb(&sync_iocb);
+}
+
+/*
+ * Synchronous front end for generic_file_aio_write_nolock(): issues the
+ * write through an on-stack sync kiocb and waits if it was queued.
+ */
+ssize_t
+generic_file_write_nolock(struct file *file, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb req;
+	ssize_t nbytes;
+
+	init_sync_kiocb(&req, file);
+	nbytes = generic_file_aio_write_nolock(&req, iov, nr_segs, ppos);
+	if (nbytes == -EIOCBQUEUED)
+		nbytes = wait_on_sync_kiocb(&req);
+
+	return nbytes;
+}
+EXPORT_SYMBOL(generic_file_write_nolock);
+
+/*
+ * Single-buffer AIO write: wraps @buf/@count in a one-element iovec,
+ * performs the write under inode->i_sem, and then syncs the written range
+ * for O_SYNC files / IS_SYNC inodes.
+ *
+ * @pos must equal iocb->ki_pos (enforced with BUG_ON).
+ */
+ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
+			       size_t count, loff_t pos)
+{
+	struct iovec vec = { .iov_base = (void __user *)buf,
+			     .iov_len = count };
+	struct file *filp = iocb->ki_filp;
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *host = mapping->host;
+	ssize_t done;
+	ssize_t sync_err;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	down(&host->i_sem);
+	done = __generic_file_aio_write_nolock(iocb, &vec, 1, &iocb->ki_pos);
+	up(&host->i_sem);
+
+	if (done <= 0)
+		return done;
+	if (!((filp->f_flags & O_SYNC) || IS_SYNC(host)))
+		return done;
+
+	/* synchronous semantics: flush exactly what was just written */
+	sync_err = sync_page_range(host, mapping, pos, done);
+	if (sync_err < 0)
+		done = sync_err;
+	return done;
+}
+EXPORT_SYMBOL(generic_file_aio_write);
+
+/*
+ * Single-buffer synchronous write: wraps @buf/@count in a one-element
+ * iovec, writes under inode->i_sem, and then syncs the written range for
+ * O_SYNC files / IS_SYNC inodes.
+ */
+ssize_t generic_file_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct iovec vec = { .iov_base = (void __user *)buf,
+			     .iov_len = count };
+	struct address_space *mapping = file->f_mapping;
+	struct inode *host = mapping->host;
+	ssize_t done;
+	ssize_t sync_err;
+
+	down(&host->i_sem);
+	done = __generic_file_write_nolock(file, &vec, 1, ppos);
+	up(&host->i_sem);
+
+	if (done <= 0)
+		return done;
+	if (!((file->f_flags & O_SYNC) || IS_SYNC(host)))
+		return done;
+
+	/* the range start is recovered as *ppos - done */
+	sync_err = sync_page_range(host, mapping, *ppos - done, done);
+	if (sync_err < 0)
+		done = sync_err;
+	return done;
+}
+EXPORT_SYMBOL(generic_file_write);
+
+/*
+ * Vectored synchronous read: runs __generic_file_aio_read() on an
+ * on-stack sync kiocb and waits for completion if the read was queued.
+ */
+ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
+			unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb req;
+	ssize_t nread;
+
+	init_sync_kiocb(&req, filp);
+	nread = __generic_file_aio_read(&req, iov, nr_segs, ppos);
+	if (nread != -EIOCBQUEUED)
+		return nread;
+
+	return wait_on_sync_kiocb(&req);
+}
+EXPORT_SYMBOL(generic_file_readv);
+
+/*
+ * Vectored synchronous write: writes @iov under inode->i_sem and then
+ * syncs the written range for O_SYNC files / IS_SYNC inodes.
+ */
+ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
+			unsigned long nr_segs, loff_t *ppos)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *host = mapping->host;
+	ssize_t done;
+	int sync_err;
+
+	down(&host->i_sem);
+	done = __generic_file_write_nolock(file, iov, nr_segs, ppos);
+	up(&host->i_sem);
+
+	if (done <= 0)
+		return done;
+	if (!((file->f_flags & O_SYNC) || IS_SYNC(host)))
+		return done;
+
+	/* the range start is recovered as *ppos - done */
+	sync_err = sync_page_range(host, mapping, *ppos - done, done);
+	if (sync_err < 0)
+		return sync_err;
+	return done;
+}
+EXPORT_SYMBOL(generic_file_writev);
+
+/*
+ * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
+ * went wrong during pagecache shootdown.
+ *
+ * @rw:      READ or WRITE (only WRITE gets the unmap/invalidate handling)
+ * @iocb:    kiocb for the request; iocb->ki_filp is the target file
+ * @iov:     user segment array
+ * @offset:  file offset of the transfer
+ * @nr_segs: number of segments in @iov
+ *
+ * Dirty pagecache is written back first; if that fails its error is
+ * returned and no I/O is attempted.  Otherwise the result of the
+ * mapping's ->direct_IO() is returned, unless invalidating the written
+ * range afterwards fails, in which case that error is returned instead.
+ */
+ssize_t
+generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	ssize_t retval;
+	size_t write_len = 0;
+
+	/*
+	 * If it's a write, unmap all mmappings of the file up-front.  This
+	 * will cause any pte dirty bits to be propagated into the pageframes
+	 * for the subsequent filemap_write_and_wait().
+	 */
+	if (rw == WRITE) {
+		write_len = iov_length(iov, nr_segs);
+	       	if (mapping_mapped(mapping))
+			unmap_mapping_range(mapping, offset, write_len, 0);
+	}
+
+	retval = filemap_write_and_wait(mapping);
+	if (retval == 0) {
+		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+						offset, nr_segs);
+		/* drop cached pages covering the range just written */
+		if (rw == WRITE && mapping->nrpages) {
+			pgoff_t end = (offset + write_len - 1)
+						>> PAGE_CACHE_SHIFT;
+			int err = invalidate_inode_pages2_range(mapping,
+					offset >> PAGE_CACHE_SHIFT, end);
+			if (err)
+				retval = err;
+		}
+	}
+	return retval;
+}
+EXPORT_SYMBOL_GPL(generic_file_direct_IO);
diff --git a/mm/fremap.c b/mm/fremap.c
new file mode 100644
index 000000000000..3235fb77c133
--- /dev/null
+++ b/mm/fremap.c
@@ -0,0 +1,256 @@
+/*
+ *   linux/mm/fremap.c
+ * 
+ * Explicit pagetable population and nonlinear (random) mappings support.
+ *
+ * started by Ingo Molnar, Copyright (C) 2002, 2003
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Tear down whatever *ptep currently holds so a new entry can be
+ * installed at @addr.
+ *
+ *  - empty pte: nothing to do
+ *  - present pte: flush cache and TLB, clear the entry and, for a valid
+ *    non-reserved page, carry the dirty bit over to the page, drop the
+ *    rmap and the page reference and decrement the mm's rss
+ *  - non-present pte: release the swap entry unless it is a file pte,
+ *    then clear the entry
+ *
+ * Both callers in this file hold mm->page_table_lock.
+ */
+static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+
+	if (pte_none(pte))
+		return;
+	if (pte_present(pte)) {
+		unsigned long pfn = pte_pfn(pte);
+
+		flush_cache_page(vma, addr, pfn);
+		/* use the value returned by the clear for the dirty check */
+		pte = ptep_clear_flush(vma, addr, ptep);
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+			if (!PageReserved(page)) {
+				if (pte_dirty(pte))
+					set_page_dirty(page);
+				page_remove_rmap(page);
+				page_cache_release(page);
+				dec_mm_counter(mm, rss);
+			}
+		}
+	} else {
+		if (!pte_file(pte))
+			free_swap_and_cache(pte_to_swp_entry(pte));
+		pte_clear(mm, addr, ptep);
+	}
+}
+
+/*
+ * Install a file page to a given virtual memory address, release any
+ * previously existing mapping.
+ *
+ * @mm:   address space to modify
+ * @vma:  vma covering @addr; vma->vm_file supplies the inode whose size
+ *        is used for the truncation check
+ * @addr: user virtual address to map at
+ * @page: page cache page to map
+ * @prot: protection bits for the new pte
+ *
+ * Takes and releases mm->page_table_lock.
+ *
+ * Returns 0 on success, -ENOMEM if a page table level could not be
+ * allocated, or -EINVAL if @page is no longer attached to a mapping or
+ * lies beyond the current end of the file.
+ */
+int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, struct page *page, pgprot_t prot)
+{
+	struct inode *inode;
+	pgoff_t size;
+	int err = -ENOMEM;
+	pte_t *pte;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+	pte_t pte_val;
+
+	pgd = pgd_offset(mm, addr);
+	spin_lock(&mm->page_table_lock);
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		goto err_unlock;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		goto err_unlock;
+
+	pte = pte_alloc_map(mm, pmd, addr);
+	if (!pte)
+		goto err_unlock;
+
+	/*
+	 * This page may have been truncated. Tell the
+	 * caller about it.
+	 */
+	err = -EINVAL;
+	inode = vma->vm_file->f_mapping->host;
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (!page->mapping || page->index >= size) {
+		/*
+		 * pte_alloc_map() handed us a mapped pte; it has to be
+		 * unmapped on this exit just as on the success path.
+		 */
+		pte_unmap(pte);
+		goto err_unlock;
+	}
+
+	zap_pte(mm, vma, addr, pte);
+
+	inc_mm_counter(mm,rss);
+	flush_icache_page(vma, page);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	page_add_file_rmap(page);
+	/* copy the entry out before unmapping the pte page */
+	pte_val = *pte;
+	pte_unmap(pte);
+	update_mmu_cache(vma, addr, pte_val);
+
+	err = 0;
+err_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return err;
+}
+EXPORT_SYMBOL(install_page);
+
+
+/*
+ * Replace whatever is mapped at @addr with a non-present "file" pte that
+ * encodes the file page offset @pgoff.
+ *
+ * Runs under mm->page_table_lock, which it takes and releases itself.
+ * Returns 0 on success or -ENOMEM when a page table level cannot be
+ * allocated.
+ */
+int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long pgoff, pgprot_t prot)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	pte_t entry;
+	int ret = -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+
+	/* walk down, allocating any missing page table level */
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		goto out;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		goto out;
+	ptep = pte_alloc_map(mm, pmd, addr);
+	if (!ptep)
+		goto out;
+
+	/* discard the old entry, then store the file offset in its place */
+	zap_pte(mm, vma, addr, ptep);
+	set_pte_at(mm, addr, ptep, pgoff_to_pte(pgoff));
+
+	/* snapshot the entry before the pte page is unmapped */
+	entry = *ptep;
+	pte_unmap(ptep);
+	update_mmu_cache(vma, addr, entry);
+	ret = 0;
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+
+/**
+ * sys_remap_file_pages - remap arbitrary pages of a shared backing store
+ *                        file within an existing vma.
+ * @start: start of the remapped virtual memory range
+ * @size: size of the remapped virtual memory range
+ * @__prot: new protection bits of the range (must currently be 0)
+ * @pgoff: to be mapped page of the backing store file
+ * @flags: 0 or MAP_NONBLOCK - the latter will cause no IO.
+ *
+ * this syscall works purely via pagetables, so it's the most efficient
+ * way to map the same (large) file into a given virtual window. Unlike
+ * mmap()/mremap() it does not create any new vmas. The new mappings are
+ * also safe across swapout.
+ *
+ * @start and @size are rounded down to a page boundary before use, and
+ * the resulting range must be non-empty and lie within a single shared
+ * vma whose vm_ops provide ->populate.
+ *
+ * NOTE: arbitrary protections are not implemented: a non-zero '__prot'
+ * is rejected with -EINVAL and the vma's default protection is used.
+ * Arbitrary protections might be implemented in the future.
+ *
+ * Returns the result of the vma's ->populate() method, or -EINVAL if the
+ * arguments or the vma are unsuitable.
+ */
+asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
+	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct address_space *mapping;
+	unsigned long end;
+	struct vm_area_struct *vma;
+	int err = -EINVAL;
+	int has_write_lock = 0;
+
+	if (__prot)
+		return err;
+	/*
+	 * Sanitize the syscall parameters:
+	 */
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	/*
+	 * The end of the range must be derived from the sanitized values:
+	 * those are what ->populate() is given, so the vma bounds check
+	 * below has to describe the same range.
+	 */
+	end = start + size;
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	if (end <= start)
+		return err;
+
+	/* Can we represent this offset inside this architecture's pte's? */
+#if PTE_FILE_MAX_BITS < BITS_PER_LONG
+	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
+		return err;
+#endif
+
+	/*
+	 * Start with the read lock.  We need down_write() to change
+	 * vma->vm_flags, so if that turns out to be necessary the lock is
+	 * dropped, retaken for writing and the lookup is redone.
+	 */
+	down_read(&mm->mmap_sem);
+ retry:
+	vma = find_vma(mm, start);
+
+	/*
+	 * Make sure the vma is shared, that it supports prefaulting,
+	 * and that the remapped range is valid and fully within
+	 * the single existing vma.  vm_private_data is used as a
+	 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
+	 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
+	 */
+	if (vma && (vma->vm_flags & VM_SHARED) &&
+		(!vma->vm_private_data ||
+			(vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
+		vma->vm_ops && vma->vm_ops->populate &&
+			start >= vma->vm_start && end <= vma->vm_end) {
+
+		/* Must set VM_NONLINEAR before any pages are populated. */
+		if (pgoff != linear_page_index(vma, start) &&
+		    !(vma->vm_flags & VM_NONLINEAR)) {
+			if (!has_write_lock) {
+				up_read(&mm->mmap_sem);
+				down_write(&mm->mmap_sem);
+				has_write_lock = 1;
+				goto retry;
+			}
+			/* move the vma from the prio tree to the nonlinear list */
+			mapping = vma->vm_file->f_mapping;
+			spin_lock(&mapping->i_mmap_lock);
+			flush_dcache_mmap_lock(mapping);
+			vma->vm_flags |= VM_NONLINEAR;
+			vma_prio_tree_remove(vma, &mapping->i_mmap);
+			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+			flush_dcache_mmap_unlock(mapping);
+			spin_unlock(&mapping->i_mmap_lock);
+		}
+
+		err = vma->vm_ops->populate(vma, start, size,
+					    vma->vm_page_prot,
+					    pgoff, flags & MAP_NONBLOCK);
+
+		/*
+		 * We can't clear VM_NONLINEAR because we'd have to do
+		 * it after ->populate completes, and that would prevent
+		 * downgrading the lock.  (Locks can't be upgraded).
+		 */
+	}
+	if (likely(!has_write_lock))
+		up_read(&mm->mmap_sem);
+	else
+		up_write(&mm->mmap_sem);
+
+	return err;
+}
+
diff --git a/mm/highmem.c b/mm/highmem.c
new file mode 100644
index 000000000000..d01276506b00
--- /dev/null
+++ b/mm/highmem.c
@@ -0,0 +1,607 @@
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Bounce page pools: page_pool is created by init_emergency_pool(),
+ * isa_page_pool (__GFP_DMA pages) by init_emergency_isa_pool().
+ */
+static mempool_t *page_pool, *isa_page_pool;
+
+/*
+ * mempool allocation callback: allocate a single page.  The pool's opaque
+ * @data pointer carries extra gfp bits that are OR-ed into @gfp_mask.
+ */
+static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+{
+	unsigned int extra = (unsigned int) (long) data;
+	unsigned int flags = gfp_mask | extra;
+
+	return alloc_page(flags);
+}
+
+/*
+ * mempool free callback: hand the page back to the page allocator.
+ * @data is unused.
+ */
+static void page_pool_free(void *page, void *data)
+{
+	struct page *victim = page;
+
+	__free_page(victim);
+}
+
+/*
+ * Virtual_count is not a pure "count".
+ *  0 means that it is not mapped, and has not been mapped
+ *    since a TLB flush - it is usable.
+ *  1 means that there are no users, but it has been mapped
+ *    since the last TLB flush - so we can't use it.
+ *  n means that there are (n-1) current users of it.
+ */
+#ifdef CONFIG_HIGHMEM
+static int pkmap_count[LAST_PKMAP];	/* per-slot use count */
+static unsigned int last_pkmap_nr;	/* slot where the last search stopped */
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+
+pte_t * pkmap_page_table;	/* ptes backing the pkmap slots */
+
+/* tasks sleeping in map_new_virtual() until a slot is released */
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+/*
+ * Reclaim every pkmap slot whose count is 1 (no users, but still mapped):
+ * reset the count to 0, clear its pte and forget the page's virtual
+ * address, then flush the kernel TLB for the whole pkmap window.
+ *
+ * Runs with kmap_lock held (see the comment in the loop).
+ */
+static void flush_all_zero_pkmaps(void)
+{
+	int i;
+
+	flush_cache_kmaps();
+
+	for (i = 0; i < LAST_PKMAP; i++) {
+		struct page *page;
+
+		/*
+		 * zero means we don't have anything to do,
+		 * >1 means that it is still in use. Only
+		 * a count of 1 means that it is free but
+		 * needs to be unmapped
+		 */
+		if (pkmap_count[i] != 1)
+			continue;
+		pkmap_count[i] = 0;
+
+		/* sanity check */
+		if (pte_none(pkmap_page_table[i]))
+			BUG();
+
+		/*
+		 * Don't need an atomic fetch-and-clear op here;
+		 * no-one has the page mapped, and cannot get at
+		 * its virtual address (and hence PTE) without first
+		 * getting the kmap_lock (which is held here).
+		 * So no dangers, even with speculative execution.
+		 */
+		page = pte_page(pkmap_page_table[i]);
+		pte_clear(&init_mm, (unsigned long)page_address(page),
+			  &pkmap_page_table[i]);
+
+		set_page_address(page, NULL);
+	}
+	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+/*
+ * Find a free pkmap slot (count 0), map @page into it and return the
+ * slot's virtual address.  The slot is left with a count of 1; the
+ * caller (kmap_high) adds the user reference.
+ *
+ * Called with kmap_lock held.  If no slot is free after a full scan, the
+ * lock is dropped while sleeping on pkmap_map_wait and retaken before
+ * retrying; if another task mapped @page meanwhile, that address is
+ * returned instead.
+ */
+static inline unsigned long map_new_virtual(struct page *page)
+{
+	unsigned long vaddr;
+	int count;
+
+start:
+	count = LAST_PKMAP;
+	/* Find an empty entry */
+	for (;;) {
+		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+		/* wrapped around: reclaim unused slots and rescan */
+		if (!last_pkmap_nr) {
+			flush_all_zero_pkmaps();
+			count = LAST_PKMAP;
+		}
+		if (!pkmap_count[last_pkmap_nr])
+			break;	/* Found a usable entry */
+		if (--count)
+			continue;
+
+		/*
+		 * Sleep for somebody else to unmap their entries
+		 */
+		{
+			DECLARE_WAITQUEUE(wait, current);
+
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			add_wait_queue(&pkmap_map_wait, &wait);
+			spin_unlock(&kmap_lock);
+			schedule();
+			remove_wait_queue(&pkmap_map_wait, &wait);
+			spin_lock(&kmap_lock);
+
+			/* Somebody else might have mapped it while we slept */
+			if (page_address(page))
+				return (unsigned long)page_address(page);
+
+			/* Re-start */
+			goto start;
+		}
+	}
+	vaddr = PKMAP_ADDR(last_pkmap_nr);
+	set_pte_at(&init_mm, vaddr,
+		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+
+	pkmap_count[last_pkmap_nr] = 1;
+	set_page_address(page, (void *)vaddr);
+
+	return vaddr;
+}
+
+/*
+ * Return a kernel virtual address for a highmem page, creating a pkmap
+ * mapping if the page does not have one, and take a reference on the
+ * pkmap slot.
+ *
+ * The page's virtual address is only meaningful while kmap_lock is held,
+ * so it is looked up under the lock.  Must not be called from interrupt
+ * context because establishing a new mapping may block.
+ */
+void fastcall *kmap_high(struct page *page)
+{
+	unsigned long kaddr;
+	unsigned long slot;
+
+	spin_lock(&kmap_lock);
+
+	kaddr = (unsigned long)page_address(page);
+	if (kaddr == 0)
+		kaddr = map_new_virtual(page);
+
+	/* a mapped slot starts at 1, so one user means a count of 2 */
+	slot = PKMAP_NR(kaddr);
+	if (++pkmap_count[slot] < 2)
+		BUG();
+
+	spin_unlock(&kmap_lock);
+	return (void*) kaddr;
+}
+
+EXPORT_SYMBOL(kmap_high);
+
+/*
+ * Drop one reference on the pkmap slot that maps @page.  The page must
+ * currently be mapped (BUG otherwise).  When the count falls to 1 the
+ * slot has no users left, so a task waiting in map_new_virtual() is
+ * woken if there is one; the wake-up happens after kmap_lock is dropped.
+ */
+void fastcall kunmap_high(struct page *page)
+{
+	unsigned long vaddr;
+	unsigned long nr;
+	int need_wakeup;
+
+	spin_lock(&kmap_lock);
+	vaddr = (unsigned long)page_address(page);
+	if (!vaddr)
+		BUG();
+	nr = PKMAP_NR(vaddr);
+
+	/*
+	 * A count must never go down to zero
+	 * without a TLB flush!
+	 */
+	need_wakeup = 0;
+	switch (--pkmap_count[nr]) {
+	case 0:
+		BUG();
+	case 1:
+		/*
+		 * Avoid an unnecessary wake_up() function call.
+		 * The common case is pkmap_count[] == 1, but
+		 * no waiters.
+		 * The tasks queued in the wait-queue are guarded
+		 * by both the lock in the wait-queue-head and by
+		 * the kmap_lock.  As the kmap_lock is held here,
+		 * no need for the wait-queue-head's lock.  Simply
+		 * test if the queue is empty.
+		 */
+		need_wakeup = waitqueue_active(&pkmap_map_wait);
+	}
+	spin_unlock(&kmap_lock);
+
+	/* do wake-up, if needed, race-free outside of the spin lock */
+	if (need_wakeup)
+		wake_up(&pkmap_map_wait);
+}
+
+EXPORT_SYMBOL(kunmap_high);
+
+#define POOL_SIZE	64
+
+/*
+ * Initcall: create the POOL_SIZE-page bounce pool, but only on systems
+ * that report some high memory.  Failure to create the pool is fatal.
+ */
+static __init int init_emergency_pool(void)
+{
+	struct sysinfo info;
+
+	si_meminfo(&info);
+	si_swapinfo(&info);
+
+	if (info.totalhigh) {
+		page_pool = mempool_create(POOL_SIZE, page_pool_alloc,
+					   page_pool_free, NULL);
+		if (!page_pool)
+			BUG();
+		printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+	}
+
+	return 0;
+}
+
+__initcall(init_emergency_pool);
+
+/*
+ * highmem version, map in to vec
+ *
+ * Copies to->bv_len bytes from @vfrom into the page of @to at
+ * to->bv_offset.  The destination page is mapped with a KM_BOUNCE_READ
+ * atomic kmap, and local interrupts are disabled for the duration of the
+ * mapping.
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+	unsigned long flags;
+	unsigned char *vto;
+
+	local_irq_save(flags);
+	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
+	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+	kunmap_atomic(vto, KM_BOUNCE_READ);
+	local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+/*
+ * !CONFIG_HIGHMEM version: the destination page is reached directly
+ * through page_address(), so a plain memcpy() is enough.
+ */
+#define bounce_copy_vec(to, vfrom)	\
+	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif
+
+#define ISA_POOL_SIZE	16
+
+/*
+ * Create the ISA (__GFP_DMA) bounce pool on first use.
+ *
+ * This is invoked whenever a queue is set up with BLK_BOUNCE_ISA as its
+ * max address, so it may be called many times; after the pool exists the
+ * call is a no-op.  Always returns 0; failure to create the pool is fatal.
+ */
+int init_emergency_isa_pool(void)
+{
+	if (!isa_page_pool) {
+		isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc,
+					       page_pool_free,
+					       (void *) __GFP_DMA);
+		if (!isa_page_pool)
+			BUG();
+
+		printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+	}
+	return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ *
+ * @to:   bio whose pages receive the data (the original bio)
+ * @from: bounce bio holding the data that was read
+ *
+ * Segments are matched by index; a segment whose page is the same in
+ * both bios was never bounced and is skipped.
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+	unsigned char *vfrom;
+	struct bio_vec *tovec, *fromvec;
+	int i;
+
+	__bio_for_each_segment(tovec, to, i, 0) {
+		fromvec = from->bi_io_vec + i;
+
+		/*
+		 * not bounced
+		 */
+		if (tovec->bv_page == fromvec->bv_page)
+			continue;
+
+		/*
+		 * fromvec->bv_offset and fromvec->bv_len might have been
+		 * modified by the block layer, so use the original copy,
+		 * bounce_copy_vec already uses tovec->bv_len
+		 */
+		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+		flush_dcache_page(tovec->bv_page);
+		bounce_copy_vec(tovec, vfrom);
+	}
+}
+
+/*
+ * Common completion for a bounce bio: propagate BIO_EOPNOTSUPP to the
+ * original bio (found in bio->bi_private), return every bounce page to
+ * @pool, complete the original bio with @err and drop the bounce bio.
+ */
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *bio_orig = bio->bi_private;
+	struct bio_vec *bvec, *org_vec;
+	int i;
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+	/*
+	 * free up bounce indirect pages used
+	 */
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		org_vec = bio_orig->bi_io_vec + i;
+		/* same page in both bios: this segment was not bounced */
+		if (bvec->bv_page == org_vec->bv_page)
+			continue;
+
+		mempool_free(bvec->bv_page, pool);	
+	}
+
+	bio_endio(bio_orig, bio_orig->bi_size, err);
+	bio_put(bio);
+}
+
+/*
+ * bi_end_io handler for bounced writes that used page_pool.  Returns 1
+ * while the bio still has bytes outstanding, 0 once it has been finished.
+ */
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
+{
+	if (!bio->bi_size) {
+		bounce_end_io(bio, page_pool, err);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * bi_end_io handler for bounced writes that used isa_page_pool.  Returns
+ * 1 while the bio still has bytes outstanding, 0 once it has been
+ * finished.
+ */
+static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (!bio->bi_size) {
+		bounce_end_io(bio, isa_page_pool, err);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Finish a bounced read: if the bounce bio is up to date, copy its data
+ * into the pages of the original bio first, then run the common
+ * completion.
+ */
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *orig = bio->bi_private;
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	if (uptodate)
+		copy_to_high_bio_irq(orig, bio);
+
+	bounce_end_io(bio, pool, err);
+}
+
+/*
+ * bi_end_io handler for bounced reads that used page_pool.  Returns 1
+ * while the bio still has bytes outstanding, 0 once it has been finished.
+ */
+static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (!bio->bi_size) {
+		__bounce_end_io_read(bio, page_pool, err);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * bi_end_io handler for bounced reads that used isa_page_pool.  Returns 1
+ * while the bio still has bytes outstanding, 0 once it has been finished.
+ */
+static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (!bio->bi_size) {
+		__bounce_end_io_read(bio, isa_page_pool, err);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Slow path of blk_queue_bounce(): build a bounce bio for *bio_orig.
+ *
+ * Every segment whose page is at or above q->bounce_pfn gets a
+ * replacement page from @pool (for writes the data is copied into it
+ * immediately); the remaining segments share the original pages.  If no
+ * segment needed bouncing, *bio_orig is left untouched.  Otherwise
+ * *bio_orig is replaced by the new bio, whose bi_private points back to
+ * the original and whose bi_end_io is the bounce handler matching @pool
+ * and the data direction.
+ */
+static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
+			mempool_t *pool)
+{
+	struct page *page;
+	struct bio *bio = NULL;
+	int i, rw = bio_data_dir(*bio_orig);
+	struct bio_vec *to, *from;
+
+	bio_for_each_segment(from, *bio_orig, i) {
+		page = from->bv_page;
+
+		/*
+		 * is destination page below bounce pfn?
+		 */
+		if (page_to_pfn(page) < q->bounce_pfn)
+			continue;
+
+		/*
+		 * irk, bounce it
+		 */
+		if (!bio)
+			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+
+		to = bio->bi_io_vec + i;
+
+		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		to->bv_len = from->bv_len;
+		to->bv_offset = from->bv_offset;
+
+		if (rw == WRITE) {
+			char *vto, *vfrom;
+
+			flush_dcache_page(from->bv_page);
+			vto = page_address(to->bv_page) + to->bv_offset;
+			vfrom = kmap(from->bv_page) + from->bv_offset;
+			memcpy(vto, vfrom, to->bv_len);
+			kunmap(from->bv_page);
+		}
+	}
+
+	/*
+	 * no pages bounced
+	 */
+	if (!bio)
+		return;
+
+	/*
+	 * at least one page was bounced, fill in possible non-highmem
+	 * pages
+	 */
+	__bio_for_each_segment(from, *bio_orig, i, 0) {
+		to = bio_iovec_idx(bio, i);
+		/* NOTE(review): relies on unbounced slots having a NULL bv_page */
+		if (!to->bv_page) {
+			to->bv_page = from->bv_page;
+			to->bv_len = from->bv_len;
+			to->bv_offset = from->bv_offset;
+		}
+	}
+
+	bio->bi_bdev = (*bio_orig)->bi_bdev;
+	bio->bi_flags |= (1 << BIO_BOUNCED);
+	bio->bi_sector = (*bio_orig)->bi_sector;
+	bio->bi_rw = (*bio_orig)->bi_rw;
+
+	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+	bio->bi_idx = (*bio_orig)->bi_idx;
+	bio->bi_size = (*bio_orig)->bi_size;
+
+	if (pool == page_pool) {
+		bio->bi_end_io = bounce_end_io_write;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read;
+	} else {
+		bio->bi_end_io = bounce_end_io_write_isa;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read_isa;
+	}
+
+	bio->bi_private = *bio_orig;
+	*bio_orig = bio;
+}
+
+/*
+ * Bounce *bio_orig if the queue cannot address all of its pages.
+ *
+ * Queues that bounce with GFP_DMA use the ISA pool, which must already
+ * exist.  For all other queues nothing needs doing when the bounce pfn
+ * is at or above the highest pfn in the system, so the per-segment walk
+ * is skipped in that case.
+ */
+void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
+{
+	mempool_t *pool;
+
+	if (q->bounce_gfp & GFP_DMA) {
+		BUG_ON(!isa_page_pool);
+		pool = isa_page_pool;
+	} else {
+		if (q->bounce_pfn >= blk_max_pfn)
+			return;
+		pool = page_pool;
+	}
+
+	/* slow path: inspect each segment */
+	__blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
+
+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER	7
+
+/*
+ * Describes one page->virtual association
+ */
+struct page_address_map {
+	struct page *page;	/* the highmem page */
+	void *virtual;		/* its current kernel virtual address */
+	struct list_head list;	/* on a hash bucket or on the freelist */
+};
+
+/*
+ * page_address_map freelist, allocated from page_address_maps.
+ * Filled by page_address_init(); set_page_address() takes entries from
+ * it when adding an association and returns them when removing one.
+ */
+static struct list_head page_address_pool;	/* freelist */
+static spinlock_t pool_lock;			/* protects page_address_pool */
+
+/*
+ * Hash table bucket
+ *
+ * The table has 1 << PA_HASH_ORDER buckets and is indexed by
+ * hash_ptr() of the struct page pointer (see page_slot()).
+ */
+static struct page_address_slot {
+	struct list_head lh;			/* List of page_address_maps */
+	spinlock_t lock;			/* Protect this bucket's list */
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+/* Return the hash bucket that @page's address association belongs to. */
+static struct page_address_slot *page_slot(struct page *page)
+{
+	unsigned long bucket = hash_ptr(page, PA_HASH_ORDER);
+
+	return page_address_htable + bucket;
+}
+
+/*
+ * Return the kernel virtual address of @page.
+ *
+ * Lowmem pages are translated directly.  For a highmem page the hash
+ * bucket is searched under its lock; NULL is returned when the page has
+ * no recorded mapping.
+ */
+void *page_address(struct page *page)
+{
+	struct page_address_slot *bucket;
+	struct page_address_map *entry;
+	unsigned long irqflags;
+	void *vaddr = NULL;
+
+	if (!PageHighMem(page))
+		return lowmem_page_address(page);
+
+	bucket = page_slot(page);
+	spin_lock_irqsave(&bucket->lock, irqflags);
+	list_for_each_entry(entry, &bucket->lh, list) {
+		if (entry->page == page) {
+			vaddr = entry->virtual;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&bucket->lock, irqflags);
+
+	return vaddr;
+}
+
+EXPORT_SYMBOL(page_address);
+
+/*
+ * Record (@virtual != NULL) or forget (@virtual == NULL) the kernel
+ * virtual address of a highmem page.
+ *
+ * Adding takes a page_address_map from the freelist (which must not be
+ * empty) and appends it to the page's hash bucket; removing moves the
+ * page's entry, if found, from the bucket back to the freelist.  The
+ * freelist and the bucket are each protected by their own lock, and the
+ * two locks are never held at the same time.
+ */
+void set_page_address(struct page *page, void *virtual)
+{
+	unsigned long flags;
+	struct page_address_slot *pas;
+	struct page_address_map *pam;
+
+	BUG_ON(!PageHighMem(page));
+
+	pas = page_slot(page);
+	if (virtual) {		/* Add */
+		BUG_ON(list_empty(&page_address_pool));
+
+		spin_lock_irqsave(&pool_lock, flags);
+		pam = list_entry(page_address_pool.next,
+				struct page_address_map, list);
+		list_del(&pam->list);
+		spin_unlock_irqrestore(&pool_lock, flags);
+
+		pam->page = page;
+		pam->virtual = virtual;
+
+		spin_lock_irqsave(&pas->lock, flags);
+		list_add_tail(&pam->list, &pas->lh);
+		spin_unlock_irqrestore(&pas->lock, flags);
+	} else {		/* Remove */
+		spin_lock_irqsave(&pas->lock, flags);
+		list_for_each_entry(pam, &pas->lh, list) {
+			if (pam->page == page) {
+				list_del(&pam->list);
+				/* drop the bucket lock before taking pool_lock */
+				spin_unlock_irqrestore(&pas->lock, flags);
+				spin_lock_irqsave(&pool_lock, flags);
+				list_add_tail(&pam->list, &page_address_pool);
+				spin_unlock_irqrestore(&pool_lock, flags);
+				goto done;
+			}
+		}
+		spin_unlock_irqrestore(&pas->lock, flags);
+	}
+done:
+	return;
+}
+
+/* Backing storage for the freelist: one entry per pkmap slot. */
+static struct page_address_map page_address_maps[LAST_PKMAP];
+
+/*
+ * Boot-time setup of the page->virtual hash: put every statically
+ * allocated page_address_map on the freelist and initialise each hash
+ * bucket's list head and lock, plus the freelist lock.
+ */
+void __init page_address_init(void)
+{
+	struct page_address_map *map;
+	struct page_address_slot *slot;
+
+	INIT_LIST_HEAD(&page_address_pool);
+	for (map = page_address_maps;
+	     map < page_address_maps + ARRAY_SIZE(page_address_maps); map++)
+		list_add(&map->list, &page_address_pool);
+
+	for (slot = page_address_htable;
+	     slot < page_address_htable + ARRAY_SIZE(page_address_htable);
+	     slot++) {
+		INIT_LIST_HEAD(&slot->lh);
+		spin_lock_init(&slot->lock);
+	}
+
+	spin_lock_init(&pool_lock);
+}
+
+#endif	/* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 000000000000..4eb5ae3fbe10
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,260 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/nodemask.h>
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static unsigned long nr_huge_pages, free_huge_pages;
+unsigned long max_huge_pages;
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static unsigned int nr_huge_pages_node[MAX_NUMNODES];
+static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static DEFINE_SPINLOCK(hugetlb_lock);
+
+/*
+ * Put @page on the free list of the node it belongs to and bump the
+ * global and per-node free counters.  The callers in this file take
+ * hugetlb_lock around the call.
+ */
+static void enqueue_huge_page(struct page *page)
+{
+	const int node = page_to_nid(page);
+
+	free_huge_pages_node[node]++;
+	free_huge_pages++;
+	list_add(&page->lru, &hugepage_freelists[node]);
+}
+
+/*
+ * Take one huge page off the free lists, preferring the current node and
+ * otherwise the lowest-numbered node that has any.  Returns NULL when
+ * every free list is empty.  The callers in this file hold hugetlb_lock.
+ */
+static struct page *dequeue_huge_page(void)
+{
+	struct page *page;
+	int node = numa_node_id();
+
+	if (list_empty(&hugepage_freelists[node])) {
+		/* Local list is empty: scan all nodes from 0 upwards. */
+		node = 0;
+		while (node < MAX_NUMNODES &&
+		       list_empty(&hugepage_freelists[node]))
+			node++;
+	}
+
+	if (node < 0 || node >= MAX_NUMNODES ||
+	    list_empty(&hugepage_freelists[node]))
+		return NULL;
+
+	page = list_entry(hugepage_freelists[node].next, struct page, lru);
+	list_del(&page->lru);
+	free_huge_pages_node[node]--;
+	free_huge_pages--;
+	return page;
+}
+
+/*
+ * Allocate one new huge page from the page allocator and account it in
+ * nr_huge_pages and the per-node total.  Successive calls rotate the
+ * target node through the online nodes.
+ *
+ * Returns the page, or NULL if the allocation failed.  The page is not
+ * placed on a free list; the caller does that.
+ */
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+					HUGETLB_PAGE_ORDER);
+	/*
+	 * Step to the next node that is actually online.  Online node ids
+	 * need not be contiguous, so "(nid + 1) % num_online_nodes()" could
+	 * pick an offline node or never reach a high-numbered one.
+	 */
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
+	if (page) {
+		nr_huge_pages++;
+		nr_huge_pages_node[page_to_nid(page)]++;
+	}
+	return page;
+}
+
+/*
+ * Return a huge page to the free pool.  The page count must already be
+ * zero.  page[1].mapping, which alloc_huge_page() points at this
+ * function, is reset to NULL.
+ */
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	page[1].mapping = NULL;
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * Hand out one huge page from the free pool, or NULL if the pool is
+ * empty.  The returned page has a count of 1, has free_huge_page stored
+ * in page[1].mapping, and has all of its constituent pages cleared.
+ */
+struct page *alloc_huge_page(void)
+{
+	struct page *hpage;
+	int idx;
+
+	spin_lock(&hugetlb_lock);
+	hpage = dequeue_huge_page();
+	spin_unlock(&hugetlb_lock);
+	if (!hpage)
+		return NULL;
+
+	set_page_count(hpage, 1);
+	hpage[1].mapping = (void *)free_huge_page;
+	for (idx = 0; idx < (HPAGE_SIZE/PAGE_SIZE); idx++)
+		clear_highpage(hpage + idx);
+	return hpage;
+}
+
+/*
+ * Init-time setup of the huge page pool: initialise the per-node free
+ * lists, then try to allocate max_huge_pages huge pages and put them on
+ * the free lists.  Allocation stops at the first failure, and
+ * max_huge_pages is lowered to the number actually obtained.
+ *
+ * Always returns 0.
+ */
+static int __init hugetlb_init(void)
+{
+	unsigned long i;
+	struct page *page;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+
+	for (i = 0; i < max_huge_pages; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+	/* free_huge_pages is unsigned long, hence %lu rather than %ld */
+	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
+	return 0;
+}
+module_init(hugetlb_init);
+
+/*
+ * Parse the "hugepages=" setup option into max_huge_pages.  If no number
+ * can be read the value is forced to 0.  Always returns 1.
+ */
+static int __init hugetlb_setup(char *s)
+{
+	int parsed = sscanf(s, "%lu", &max_huge_pages);
+
+	if (parsed <= 0)
+		max_huge_pages = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Drop one huge page from the pool accounting and give it back to the
+ * page allocator.  Each constituent page has a set of state flags
+ * cleared and its count zeroed; the head page then gets a count of 1 so
+ * that __free_pages() releases the whole HUGETLB_PAGE_ORDER block.
+ */
+static void update_and_free_page(struct page *page)
+{
+	const int stale_flags =
+			1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+			1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+			1 << PG_private | 1<< PG_writeback;
+	int idx;
+
+	nr_huge_pages--;
+	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
+
+	for (idx = 0; idx < (HPAGE_SIZE / PAGE_SIZE); idx++) {
+		struct page *sub = page + idx;
+
+		sub->flags &= ~stale_flags;
+		set_page_count(sub, 0);
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * Release free huge pages that are not in highmem until nr_huge_pages
+ * has dropped to @count or no such page is left on any node's free list.
+ * The caller in this file holds hugetlb_lock.
+ */
+static void try_to_free_low(unsigned long count)
+{
+	int i, nid;
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page, *next;
+		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+			if (PageHighMem(page))
+				continue;
+			/*
+			 * Look the node up before the page goes back to the
+			 * page allocator; it is no longer ours afterwards.
+			 */
+			nid = page_zone(page)->zone_pgdat->node_id;
+			list_del(&page->lru);
+			update_and_free_page(page);
+			free_huge_pages--;
+			free_huge_pages_node[nid]--;
+			if (count >= nr_huge_pages)
+				return;
+		}
+	}
+}
+#else
+/*
+ * !CONFIG_HIGHMEM variant: nothing to do.  Exists so that
+ * set_max_huge_pages() can call it unconditionally.
+ */
+static inline void try_to_free_low(unsigned long count)
+{
+}
+#endif
+
+/*
+ * Grow or shrink the huge page pool towards @count pages and return the
+ * resulting nr_huge_pages.  Growing stops early if a fresh huge page
+ * cannot be allocated; shrinking only releases pages that are on the
+ * free lists, so the pool may end up larger than @count.
+ */
+static unsigned long set_max_huge_pages(unsigned long count)
+{
+	struct page *page;
+
+	/* Grow: allocate and enqueue until the target is reached. */
+	while (count > nr_huge_pages) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			return nr_huge_pages;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	if (count >= nr_huge_pages)
+		return nr_huge_pages;
+
+	/* Shrink: non-highmem pages first, then whatever is free. */
+	spin_lock(&hugetlb_lock);
+	try_to_free_low(count);
+	for (;;) {
+		if (count >= nr_huge_pages)
+			break;
+		page = dequeue_huge_page();
+		if (!page)
+			break;
+		update_and_free_page(page);
+	}
+	spin_unlock(&hugetlb_lock);
+	return nr_huge_pages;
+}
+
+/*
+ * sysctl handler: let proc_doulongvec_minmax() process the request, then
+ * resize the pool to max_huge_pages and store back the size actually
+ * reached.  Runs on every invocation, whatever @write is, and always
+ * returns 0.
+ */
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	unsigned long reached;
+
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	reached = set_max_huge_pages(max_huge_pages);
+	max_huge_pages = reached;
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Format the global huge page statistics (total, free, page size in kB)
+ * into @buf and return sprintf()'s result.
+ */
+int hugetlb_report_meminfo(char *buf)
+{
+	const unsigned long total = nr_huge_pages;
+	const unsigned long unused = free_huge_pages;
+	const unsigned long size_kb = HPAGE_SIZE/1024;
+
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			total, unused, size_kb);
+}
+
+/*
+ * Format the huge page totals of node @nid into @buf and return
+ * sprintf()'s result.  @nid is used as an array index without checking.
+ */
+int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	const unsigned int total = nr_huge_pages_node[nid];
+	const unsigned int unused = free_huge_pages_node[nid];
+
+	return sprintf(buf,
+		"Node %d HugePages_Total: %5u\n"
+		"Node %d HugePages_Free:  %5u\n",
+		nid, total,
+		nid, unused);
+}
+
+/*
+ * Return non-zero if @size bytes, rounded up to whole huge pages, fit in
+ * the currently free huge pages.
+ */
+int is_hugepage_mem_enough(size_t size)
+{
+	unsigned long wanted = (size + ~HPAGE_MASK)/HPAGE_SIZE;
+
+	return wanted <= free_huge_pages;
+}
+
+/*
+ * Return the amount of memory held in the huge page pool, expressed in
+ * PAGE_SIZE units.
+ */
+unsigned long hugetlb_total_pages(void)
+{
+	const unsigned long pages_per_hpage = HPAGE_SIZE / PAGE_SIZE;
+
+	return nr_huge_pages * pages_per_hpage;
+}
+EXPORT_SYMBOL(hugetlb_total_pages);
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
+ * get this far.
+ *
+ * The return statement after BUG() is never meant to be reached; it only
+ * satisfies the function's return type.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+/*
+ * vm operations for hugetlb VMAs: only ->nopage is set, and it points at
+ * hugetlb_nopage(), which BUG()s.
+ */
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
+
+/*
+ * Unmap @length bytes of huge pages starting at @start in @vma, taking
+ * the mm's page_table_lock around unmap_hugepage_range().
+ */
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	spinlock_t *ptl = &vma->vm_mm->page_table_lock;
+	unsigned long end = start + length;
+
+	spin_lock(ptl);
+	unmap_hugepage_range(vma, start, end);
+	spin_unlock(ptl);
+}
diff --git a/mm/internal.h b/mm/internal.h
new file mode 100644
index 000000000000..6bf134e8fb3d
--- /dev/null
+++ b/mm/internal.h
@@ -0,0 +1,13 @@
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* page_alloc.c */
+extern void set_page_refs(struct page *page, int order);
diff --git a/mm/madvise.c b/mm/madvise.c
new file mode 100644
index 000000000000..944b5e52d812
--- /dev/null
+++ b/mm/madvise.c
@@ -0,0 +1,242 @@
+/*
+ *	linux/mm/madvise.c
+ *
+ * Copyright (C) 1999  Linus Torvalds
+ * Copyright (C) 2002  Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Apply a read-hint behavior to [start, end) of @vma.  The vma is split
+ * first when the range does not coincide with its boundaries, so that
+ * the hint is set on an area of its own.
+ *
+ * Returns 0 on success, or the split_vma() error with -ENOMEM reported
+ * as -EAGAIN.
+ */
+static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
+			     unsigned long end, int behavior)
+{
+	struct mm_struct * mm = vma->vm_mm;
+	int error = 0;
+
+	if (start != vma->vm_start)
+		error = split_vma(mm, vma, start, 1);
+	if (!error && end != vma->vm_end)
+		error = split_vma(mm, vma, end, 0);
+	if (error)
+		return (error == -ENOMEM) ? -EAGAIN : error;
+
+	/*
+	 * vm_flags is protected by the mmap_sem held in write mode.
+	 */
+	VM_ClearReadHint(vma);
+
+	if (behavior == MADV_SEQUENTIAL)
+		vma->vm_flags |= VM_SEQ_READ;
+	else if (behavior == MADV_RANDOM)
+		vma->vm_flags |= VM_RAND_READ;
+
+	return 0;
+}
+
+/*
+ * Start read-ahead for the file pages backing [start, end) of @vma,
+ * without waiting for the I/O.  @end is clamped to the end of the vma.
+ *
+ * Returns -EBADF if the vma has no file, otherwise 0.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct file *file = vma->vm_file;
+	unsigned long first_pg, end_pg;
+
+	if (!file)
+		return -EBADF;
+
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+
+	/* Translate the virtual addresses into file page offsets. */
+	first_pg = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	end_pg = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	force_page_cache_readahead(file->f_mapping, file, first_pg,
+			max_sane_readahead(end_pg - first_pg));
+	return 0;
+}
+
+/*
+ * Application no longer needs these pages.  If the pages are dirty,
+ * it's OK to just throw them away.  The app will be more careful about
+ * data it wants to keep.  Be sure to free swap resources too.  The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do.  This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them.  There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	unsigned long len = end - start;
+
+	/* Locked and hugetlb areas are refused. */
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+	if (is_vm_hugetlb_page(vma))
+		return -EINVAL;
+
+	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+		zap_page_range(vma, start, len, NULL);
+	} else {
+		/* Nonlinear vma: pass zap details covering every index. */
+		struct zap_details details = {
+			.nonlinear_vma = vma,
+			.last_index = ULONG_MAX,
+		};
+		zap_page_range(vma, start, len, &details);
+	}
+	return 0;
+}
+
+/*
+ * Dispatch one madvise request for [start, end) of a single vma to the
+ * handler matching @behavior.  Unknown behaviors yield -EINVAL.
+ */
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+			unsigned long end, int behavior)
+{
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		return madvise_behavior(vma, start, end, behavior);
+
+	case MADV_WILLNEED:
+		return madvise_willneed(vma, start, end);
+
+	case MADV_DONTNEED:
+		return madvise_dontneed(vma, start, end);
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+	size_t len;
+
+	/* Everything below, argument checks included, runs under mmap_sem held for write. */
+	down_write(&current->mm->mmap_sem);
+
+	/* start must be page aligned */
+	if (start & ~PAGE_MASK)
+		goto out;
+	/* Round the length up to a whole number of pages. */
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		goto out;
+
+	end = start + len;
+	/* start + len wrapped around */
+	if (end < start)
+		goto out;
+
+	/* A zero-length request succeeds without touching any vma. */
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			/* Hole before this vma: remember it and skip ahead. */
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			/* Last vma of the range; start >= end is possible after skipping a hole. */
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		/* Continue with the next vma from where this one ended. */
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
diff --git a/mm/memory.c b/mm/memory.c
new file mode 100644
index 000000000000..fb6e5deb873a
--- /dev/null
+++ b/mm/memory.c
@@ -0,0 +1,2165 @@
+/*
+ *  linux/mm/memory.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+/*
+ * demand-loading started 01.12.91 - seems it is high on the list of
+ * things wanted, and it should be easy to implement. - Linus
+ */
+
+/*
+ * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
+ * pages started 02.12.91, seems to work. - Linus.
+ *
+ * Tested sharing by executing about 30 /bin/sh: under the old kernel it
+ * would have taken more than the 6M I have free, but it worked well as
+ * far as I could see.
+ *
+ * Also corrected some "invalidate()"s - I wasn't doing enough of them.
+ */
+
+/*
+ * Real VM (paging to/from disk) started 18.12.91. Much more work and
+ * thought has to go into this. Oh, well..
+ * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
+ *		Found it. Everything seems to work now.
+ * 20.12.91  -  Ok, making the swap-device changeable like the root.
+ */
+
+/*
+ * 05.04.94  -  Multi-page memory management added for v1.1.
+ * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ *		(Gerhard.Wichert@pdb.siemens.de)
+ *
+ * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+
+#include <linux/swapops.h>
+#include <linux/elf.h>
+
+#ifndef CONFIG_DISCONTIGMEM
+/* use the per-pgdat data instead for discontigmem - mbligh */
+unsigned long max_mapnr;
+struct page *mem_map;
+
+EXPORT_SYMBOL(max_mapnr);
+EXPORT_SYMBOL(mem_map);
+#endif
+
+unsigned long num_physpages;
+/*
+ * A number of key systems in x86 including ioremap() rely on the assumption
+ * that high_memory defines the upper bound on direct map memory, then end
+ * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
+ * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
+void * high_memory;
+unsigned long vmalloc_earlyreserve;
+
+EXPORT_SYMBOL(num_physpages);
+EXPORT_SYMBOL(high_memory);
+EXPORT_SYMBOL(vmalloc_earlyreserve);
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none.  Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+/* Report the bad pgd entry through pgd_ERROR(), then clear it. */
+void pgd_clear_bad(pgd_t *pgd)
+{
+	pgd_ERROR(*pgd);
+	pgd_clear(pgd);
+}
+
+/* Report the bad pud entry through pud_ERROR(), then clear it. */
+void pud_clear_bad(pud_t *pud)
+{
+	pud_ERROR(*pud);
+	pud_clear(pud);
+}
+
+/* Report the bad pmd entry through pmd_ERROR(), then clear it. */
+void pmd_clear_bad(pmd_t *pmd)
+{
+	pmd_ERROR(*pmd);
+	pmd_clear(pmd);
+}
+
+/*
+ * Free the pte page that @pmd points to, but only when [addr, end) is
+ * PMD-aligned at both ends; otherwise nothing is done.
+ *
+ * Note: this doesn't free the actual pages themselves. That
+ * has been handled earlier when unmapping all the memory regions.
+ */
+static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+				unsigned long addr, unsigned long end)
+{
+	struct page *pte_page;
+
+	/* Only free fully aligned ranges */
+	if ((addr | end) & ~PMD_MASK)
+		return;
+
+	pte_page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	dec_page_state(nr_page_table_pages);
+	tlb->mm->nr_ptes--;
+	pte_free_tlb(tlb, pte_page);
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud and pass every
+ * populated one to clear_pte_range().  When the range is PUD-aligned at
+ * both ends, the pud entry is cleared afterwards and the pmd table is
+ * handed to pmd_free_tlb().
+ */
+static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	pmd_t *empty_pmd = NULL;
+
+	pmd = pmd_offset(pud, addr);
+
+	/* Only free fully aligned ranges */
+	if (!((addr | end) & ~PUD_MASK))
+		empty_pmd = pmd;
+	do {
+		next = pmd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pmd and addr still advance */
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		clear_pte_range(tlb, pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+
+	/* empty_pmd still points at the first pmd slot, saved before the walk */
+	if (empty_pmd) {
+		pud_clear(pud);
+		pmd_free_tlb(tlb, empty_pmd);
+	}
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd and pass every
+ * populated one to clear_pmd_range().  When the range is PGDIR-aligned
+ * at both ends, the pgd entry is cleared afterwards and the pud table is
+ * handed to pud_free_tlb().
+ */
+static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+	pud_t *empty_pud = NULL;
+
+	pud = pud_offset(pgd, addr);
+
+	/* Only free fully aligned ranges */
+	if (!((addr | end) & ~PGDIR_MASK))
+		empty_pud = pud;
+	do {
+		next = pud_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pud and addr still advance */
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		clear_pmd_range(tlb, pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	/* empty_pud still points at the first pud slot, saved before the walk */
+	if (empty_pud) {
+		pgd_clear(pgd);
+		pud_free_tlb(tlb, empty_pud);
+	}
+}
+
+/*
+ * This function clears user-level page tables of a process.
+ * Unlike other pagetable walks, some memory layouts might give end 0.
+ * Must be called with pagetable lock held.
+ *
+ * @tlb supplies the mm (tlb->mm) and receives the freed page-table pages;
+ * [addr, end) is walked pgd entry by pgd entry.
+ */
+void clear_page_range(struct mmu_gather *tlb,
+				unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	pgd = pgd_offset(tlb->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pgd and addr still advance */
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		clear_pud_range(tlb, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+/*
+ * Return the mapped pte for @address under @pmd, allocating the pte page
+ * first if the pmd is not present.  Entered and left with
+ * mm->page_table_lock held; the lock is dropped around the allocation.
+ * Returns NULL if the allocation fails.
+ */
+pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	struct page *new;
+
+	if (pmd_present(*pmd))
+		return pte_offset_map(pmd, address);
+
+	spin_unlock(&mm->page_table_lock);
+	new = pte_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
+
+	/*
+	 * The lock was dropped, so somebody else may have populated the
+	 * entry in the meantime: re-check before installing ours.
+	 */
+	if (pmd_present(*pmd)) {
+		pte_free(new);
+	} else {
+		mm->nr_ptes++;
+		inc_page_state(nr_page_table_pages);
+		pmd_populate(mm, pmd, new);
+	}
+	return pte_offset_map(pmd, address);
+}
+
+/*
+ * Kernel counterpart of pte_alloc_map(): return the pte for @address
+ * under @pmd, allocating the pte table first if the pmd is not present.
+ * Entered and left with mm->page_table_lock held; the lock is dropped
+ * around the allocation.  Returns NULL if the allocation fails.
+ */
+pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+	pte_t *new;
+
+	if (pmd_present(*pmd))
+		return pte_offset_kernel(pmd, address);
+
+	spin_unlock(&mm->page_table_lock);
+	new = pte_alloc_one_kernel(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
+
+	/*
+	 * The lock was dropped, so somebody else may have populated the
+	 * entry in the meantime: re-check before installing ours.
+	 */
+	if (pmd_present(*pmd)) {
+		pte_free_kernel(new);
+	} else {
+		pmd_populate_kernel(mm, pmd, new);
+	}
+	return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * copy one vm_area from one task to the other. Assumes the page tables
+ * already present in the new task to be cleared in the whole range
+ * covered by this vma.
+ *
+ * dst->page_table_lock is held on entry and exit,
+ * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
+ */
+
+/*
+ * Copy the single pte at @src_pte into @dst_pte for address @addr.
+ *  - A non-present pte (swap or file entry) is copied verbatim; for a
+ *    non-file entry swap_duplicate() is called and dst_mm is linked
+ *    onto the mmlist next to src_mm if it is not on a list yet.
+ *  - A pte whose pfn has no valid struct page, or whose page is
+ *    reserved, is copied verbatim.
+ *  - Otherwise the page gets an extra reference and page_dup_rmap(),
+ *    dst_mm's rss counters are incremented, and the pte is installed
+ *    old, clean for shared mappings, and write-protected in both mms
+ *    for COW mappings.
+ */
+static inline void
+copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+		unsigned long addr)
+{
+	pte_t pte = *src_pte;
+	struct page *page;
+	unsigned long pfn;
+
+	/* pte contains position in swap or file, so copy. */
+	if (unlikely(!pte_present(pte))) {
+		if (!pte_file(pte)) {
+			swap_duplicate(pte_to_swp_entry(pte));
+			/* make sure dst_mm is on swapoff's mmlist. */
+			if (unlikely(list_empty(&dst_mm->mmlist))) {
+				spin_lock(&mmlist_lock);
+				list_add(&dst_mm->mmlist, &src_mm->mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+		}
+		set_pte_at(dst_mm, addr, dst_pte, pte);
+		return;
+	}
+
+	pfn = pte_pfn(pte);
+	/* the pte points outside of valid memory, the
+	 * mapping is assumed to be good, meaningful
+	 * and not mapped via rmap - duplicate the
+	 * mapping as is.
+	 */
+	page = NULL;
+	if (pfn_valid(pfn))
+		page = pfn_to_page(pfn);
+
+	if (!page || PageReserved(page)) {
+		set_pte_at(dst_mm, addr, dst_pte, pte);
+		return;
+	}
+
+	/*
+	 * If it's a COW mapping, write protect it both
+	 * in the parent and the child
+	 */
+	if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+		/* re-read so the child receives the write-protected value */
+		pte = *src_pte;
+	}
+
+	/*
+	 * If it's a shared mapping, mark it clean in
+	 * the child
+	 */
+	if (vm_flags & VM_SHARED)
+		pte = pte_mkclean(pte);
+	pte = pte_mkold(pte);
+	get_page(page);
+	inc_mm_counter(dst_mm, rss);
+	if (PageAnon(page))
+		inc_mm_counter(dst_mm, anon_rss);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	page_dup_rmap(page);
+}
+
+/*
+ * Copy the ptes for [addr, end) from @src_pmd to @dst_pmd, allocating
+ * the destination pte page as needed.  The copy is done in batches: once
+ * enough work has been done and a reschedule or lock break is wanted,
+ * the loop is left, the src lock is dropped, cond_resched_lock() is
+ * called on the dst lock, and the copy restarts at the current address.
+ *
+ * Returns 0 on success or -ENOMEM if the destination pte page cannot be
+ * allocated.
+ */
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pte_t *src_pte, *dst_pte;
+	unsigned long vm_flags = vma->vm_flags;
+	int progress;
+
+again:
+	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+	if (!dst_pte)
+		return -ENOMEM;
+	src_pte = pte_offset_map_nested(src_pmd, addr);
+
+	/* progress weighs the work done: 1 per empty pte, 8 per copied pte */
+	progress = 0;
+	spin_lock(&src_mm->page_table_lock);
+	do {
+		/*
+		 * We are holding two locks at this point - either of them
+		 * could generate latencies in another task on another CPU.
+		 */
+		if (progress >= 32 && (need_resched() ||
+		    need_lockbreak(&src_mm->page_table_lock) ||
+		    need_lockbreak(&dst_mm->page_table_lock)))
+			break;
+		if (pte_none(*src_pte)) {
+			progress++;
+			continue;
+		}
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+		progress += 8;
+	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+	spin_unlock(&src_mm->page_table_lock);
+
+	pte_unmap_nested(src_pte - 1);
+	pte_unmap(dst_pte - 1);
+	cond_resched_lock(&dst_mm->page_table_lock);
+	/* left the loop early via break: pick up again at addr */
+	if (addr != end)
+		goto again;
+	return 0;
+}
+
+/*
+ * Copy the pmd-level range [addr, end): allocate the destination pmd,
+ * then call copy_pte_range() for every populated source pmd entry.
+ * Returns 0 on success or -ENOMEM on allocation failure.
+ */
+static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pmd_t *src_pmd, *dst_pmd;
+	unsigned long next;
+
+	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
+	if (!dst_pmd)
+		return -ENOMEM;
+	src_pmd = pmd_offset(src_pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so both pmd pointers and addr still advance */
+		if (pmd_none_or_clear_bad(src_pmd))
+			continue;
+		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Copy the pud-level range [addr, end): allocate the destination pud,
+ * then call copy_pmd_range() for every populated source pud entry.
+ * Returns 0 on success or -ENOMEM on allocation failure.
+ */
+static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	pud_t *src_pud, *dst_pud;
+	unsigned long next;
+
+	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
+	if (!dst_pud)
+		return -ENOMEM;
+	src_pud = pud_offset(src_pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so both pud pointers and addr still advance */
+		if (pud_none_or_clear_bad(src_pud))
+			continue;
+		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pud++, src_pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Copy the page table entries covering the whole of @vma
+ * (vm_start..vm_end) from @src_mm to @dst_mm.  Hugetlb vmas are handed
+ * to copy_hugetlb_page_range() instead.
+ *
+ * Returns 0 on success or -ENOMEM if a page table level cannot be
+ * allocated; for hugetlb vmas, whatever copy_hugetlb_page_range()
+ * returns.
+ */
+int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		struct vm_area_struct *vma)
+{
+	pgd_t *src_pgd, *dst_pgd;
+	unsigned long next;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	if (is_vm_hugetlb_page(vma))
+		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+
+	dst_pgd = pgd_offset(dst_mm, addr);
+	src_pgd = pgd_offset(src_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so both pgd pointers and addr still advance */
+		if (pgd_none_or_clear_bad(src_pgd))
+			continue;
+		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Tear down the ptes for [addr, end) under @pmd.
+ *
+ * Present ptes are cleared and their pages (when there is a valid,
+ * non-reserved struct page) are passed to the mmu_gather for freeing.
+ * Non-present ptes are cleared, with swap entries released through
+ * free_swap_and_cache(), unless @details is set, in which case they are
+ * left in place.  @details can also restrict which present pages are
+ * zapped, by mapping and/or by page index range.
+ */
+static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct zap_details *details)
+{
+	pte_t *pte;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		pte_t ptent = *pte;
+		/* each "continue" below jumps to the loop condition, so pte and addr still advance */
+		if (pte_none(ptent))
+			continue;
+		if (pte_present(ptent)) {
+			/* page stays NULL when the pfn is invalid or the page is reserved */
+			struct page *page = NULL;
+			unsigned long pfn = pte_pfn(ptent);
+			if (pfn_valid(pfn)) {
+				page = pfn_to_page(pfn);
+				if (PageReserved(page))
+					page = NULL;
+			}
+			if (unlikely(details) && page) {
+				/*
+				 * unmap_shared_mapping_pages() wants to
+				 * invalidate cache without truncating:
+				 * unmap shared but keep private pages.
+				 */
+				if (details->check_mapping &&
+				    details->check_mapping != page->mapping)
+					continue;
+				/*
+				 * Each page->index must be checked when
+				 * invalidating or truncating nonlinear.
+				 */
+				if (details->nonlinear_vma &&
+				    (page->index < details->first_index ||
+				     page->index > details->last_index))
+					continue;
+			}
+			ptent = ptep_get_and_clear(tlb->mm, addr, pte);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+			if (unlikely(!page))
+				continue;
+			/*
+			 * Nonlinear vma and the page sits at a different
+			 * address than its index implies: leave a file pte
+			 * recording page->index in place of the mapping.
+			 */
+			if (unlikely(details) && details->nonlinear_vma
+			    && linear_page_index(details->nonlinear_vma,
+						addr) != page->index)
+				set_pte_at(tlb->mm, addr, pte,
+					   pgoff_to_pte(page->index));
+			/* carry dirty/young state from the pte over to the page */
+			if (pte_dirty(ptent))
+				set_page_dirty(page);
+			if (PageAnon(page))
+				dec_mm_counter(tlb->mm, anon_rss);
+			else if (pte_young(ptent))
+				mark_page_accessed(page);
+			tlb->freed++;
+			page_remove_rmap(page);
+			tlb_remove_page(tlb, page);
+			continue;
+		}
+		/*
+		 * If details->check_mapping, we leave swap entries;
+		 * if details->nonlinear_vma, we leave file entries.
+		 */
+		if (unlikely(details))
+			continue;
+		if (!pte_file(ptent))
+			free_swap_and_cache(pte_to_swp_entry(ptent));
+		pte_clear(tlb->mm, addr, pte);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(pte - 1);
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud and call
+ * zap_pte_range() on every populated one.
+ */
+static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				struct zap_details *details)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pmd and addr still advance */
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		zap_pte_range(tlb, pmd, addr, next, details);
+	} while (pmd++, addr = next, addr != end);
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd and call
+ * zap_pmd_range() on every populated one.
+ */
+static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				struct zap_details *details)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pud and addr still advance */
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		zap_pmd_range(tlb, pud, addr, next, details);
+	} while (pud++, addr = next, addr != end);
+}
+
+/*
+ * Unmap [addr, end) of @vma by walking its page tables from the pgd
+ * level down, bracketed by tlb_start_vma()/tlb_end_vma().  BUGs if the
+ * range is empty or inverted.
+ */
+static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end,
+				struct zap_details *details)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	/* details that select neither a mapping nor a nonlinear vma are treated as none */
+	if (details && !details->check_mapping && !details->nonlinear_vma)
+		details = NULL;
+
+	BUG_ON(addr >= end);
+	tlb_start_vma(tlb, vma);
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		/* "continue" jumps to the loop condition, so pgd and addr still advance */
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		zap_pud_range(tlb, pgd, addr, next, details);
+	} while (pgd++, addr = next, addr != end);
+	tlb_end_vma(tlb, vma);
+}
+
+#ifdef CONFIG_PREEMPT
+# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
+#else
+/* No preempt: go for improved straight-line efficiency */
+# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
+#endif
+
+/**
+ * unmap_vmas - unmap a range of memory covered by a list of vma's
+ * @tlbp: address of the caller's struct mmu_gather
+ * @mm: the controlling mm_struct
+ * @vma: the starting vma
+ * @start_addr: virtual address at which to start unmapping
+ * @end_addr: virtual address at which to end unmapping
+ * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Returns the number of vma's which were covered by the unmapping.
+ *
+ * Unmap all pages in the vma list.  Called under page_table_lock.
+ *
+ * We aim to not hold page_table_lock for too long (for scheduling latency
+ * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
+ * return the ending mmu_gather to the caller.
+ *
+ * Only addresses between `start' and `end' will be unmapped.
+ *
+ * The VMA list must be sorted in ascending virtual address order.
+ *
+ * unmap_vmas() assumes that the caller will flush the whole unmapped address
+ * range after unmap_vmas() returns.  So the only responsibility here is to
+ * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
+ * drops the lock and schedules.
+ */
+int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long start_addr,
+		unsigned long end_addr, unsigned long *nr_accounted,
+		struct zap_details *details)
+{
+	/* bytes left in the current batch before the gather is finished */
+	unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
+	int tlb_start_valid = 0;
+	int ret = 0;
+	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
+	int fullmm = tlb_is_full_mm(*tlbp);
+
+	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+		unsigned long start;
+		unsigned long end;
+
+		/* clip [start_addr, end_addr) to this vma; skip it if nothing is left */
+		start = max(vma->vm_start, start_addr);
+		if (start >= vma->vm_end)
+			continue;
+		end = min(vma->vm_end, end_addr);
+		if (end <= vma->vm_start)
+			continue;
+
+		if (vma->vm_flags & VM_ACCOUNT)
+			*nr_accounted += (end - start) >> PAGE_SHIFT;
+
+		ret++;
+		while (start != end) {
+			unsigned long block;
+
+			/* remember where the current batch began */
+			if (!tlb_start_valid) {
+				tlb_start = start;
+				tlb_start_valid = 1;
+			}
+
+			if (is_vm_hugetlb_page(vma)) {
+				/* hugetlb ranges are unmapped in one go */
+				block = end - start;
+				unmap_hugepage_range(vma, start, end);
+			} else {
+				block = min(zap_bytes, end - start);
+				unmap_page_range(*tlbp, vma, start,
+						start + block, details);
+			}
+
+			start += block;
+			zap_bytes -= block;
+			/* signed compare: a hugetlb block can push zap_bytes below zero */
+			if ((long)zap_bytes > 0)
+				continue;
+
+			/* batch exhausted: finish this gather before possibly dropping the lock */
+			tlb_finish_mmu(*tlbp, tlb_start, start);
+
+			if (need_resched() ||
+				need_lockbreak(&mm->page_table_lock) ||
+				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+				if (i_mmap_lock) {
+					/* must reset count of rss freed */
+					*tlbp = tlb_gather_mmu(mm, fullmm);
+					/* report how far we got and return early to the caller */
+					details->break_addr = start;
+					goto out;
+				}
+				spin_unlock(&mm->page_table_lock);
+				cond_resched();
+				spin_lock(&mm->page_table_lock);
+			}
+
+			/* start a fresh gather and a fresh batch */
+			*tlbp = tlb_gather_mmu(mm, fullmm);
+			tlb_start_valid = 0;
+			zap_bytes = ZAP_BLOCK_SIZE;
+		}
+	}
+out:
+	return ret;
+}
+
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * hugetlb vmas are handed straight to zap_hugepage_range().  For any other
+ * vma the range [@address, @address + @size) is unmapped through
+ * unmap_vmas() and the TLB gather is finished, all under
+ * mm->page_table_lock, which this function takes and releases itself.
+ */
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	if (!is_vm_hugetlb_page(vma)) {
+		struct mm_struct *owner = vma->vm_mm;
+		unsigned long limit = address + size;
+		unsigned long accounted = 0;
+		struct mmu_gather *gather;
+
+		lru_add_drain();
+		spin_lock(&owner->page_table_lock);
+		gather = tlb_gather_mmu(owner, 0);
+		unmap_vmas(&gather, owner, vma, address, limit,
+				&accounted, details);
+		tlb_finish_mmu(gather, address, limit);
+		spin_unlock(&owner->page_table_lock);
+	} else {
+		zap_hugepage_range(vma, address, size);
+	}
+}
+
+/*
+ * Do a quick page-table lookup for a single page.
+ * mm->page_table_lock must be held.
+ *
+ * @read and @write select extra permission checks: when nonzero, a present
+ * pte must also satisfy pte_read() / pte_write() or the lookup fails.
+ *
+ * Huge pages are tried first: whatever follow_huge_addr() returns is passed
+ * back unless it is an error pointer, and a huge pmd is resolved through
+ * follow_huge_pmd().  Otherwise the pgd/pud/pmd/pte levels are walked.
+ *
+ * Returns the struct page when the pte is present, passes the requested
+ * checks and its pfn is valid; NULL in every other case.  On success the
+ * page is marked accessed, and for a @write lookup it is also marked dirty
+ * if neither the pte nor the page is dirty yet.  No page reference is taken
+ * in this function.
+ */
+static struct page *
+__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	unsigned long pfn;
+	struct page *page;
+
+	page = follow_huge_addr(mm, address, write);
+	if (! IS_ERR(page))
+		return page;	/* the hugetlb lookup gave an answer */
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+	
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+	if (pmd_huge(*pmd))
+		return follow_huge_pmd(mm, address, pmd, write);
+
+	ptep = pte_offset_map(pmd, address);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;		/* work on a copy so the mapping can be dropped */
+	pte_unmap(ptep);
+	if (pte_present(pte)) {
+		if (write && !pte_write(pte))
+			goto out;
+		if (read && !pte_read(pte))
+			goto out;
+		pfn = pte_pfn(pte);
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (write && !pte_dirty(pte) && !PageDirty(page))
+				set_page_dirty(page);
+			mark_page_accessed(page);
+			return page;
+		}
+	}
+
+out:
+	return NULL;
+}
+
+/*
+ * Look up the page mapped at @address in @mm without requiring the pte to
+ * be readable; when @write is nonzero the pte must be writable.
+ * Thin wrapper around __follow_page(), whose rules apply unchanged.
+ */
+struct page *
+follow_page(struct mm_struct *mm, unsigned long address, int write)
+{
+	const int read = 0;	/* no pte_read() check for this entry point */
+
+	return __follow_page(mm, address, read, write);
+}
+
+/*
+ * Returns 1 when __follow_page() finds a page at @address in @mm whose pte
+ * passes the read check, 0 otherwise.
+ */
+int
+check_user_page_readable(struct mm_struct *mm, unsigned long address)
+{
+	struct page *found;
+
+	found = __follow_page(mm, address, /*read*/1, /*write*/0);
+	return found ? 1 : 0;
+}
+
+EXPORT_SYMBOL(check_user_page_readable);
+
+/*
+ * Given a struct page pointer, is there a useful struct page behind it?
+ * Hands @page back when its pfn passes pfn_valid(), NULL otherwise.
+ * This may become more complex in the future if we start dealing
+ * with IO-aperture pages for direct-IO.
+ */
+
+static inline struct page *get_page_map(struct page *page)
+{
+	return pfn_valid(page_to_pfn(page)) ? page : NULL;
+}
+
+
+/*
+ * Returns 1 when @vma has no ->nopage method and the page tables of @mm
+ * hold no usable pgd, pud or pmd entry covering @address, i.e. there is
+ * not even a pte slot for that address yet.  Returns 0 otherwise.
+ */
+static inline int
+untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
+			 unsigned long address)
+{
+	pgd_t *top;
+	pud_t *upper;
+	pmd_t *middle;
+
+	/* A vma with a nopage handler is not an anonymous mapping. */
+	if (vma->vm_ops && vma->vm_ops->nopage)
+		return 0;
+
+	/* Walk down one level at a time; stop at the first missing one. */
+	top = pgd_offset(mm, address);
+	if (pgd_none(*top) || unlikely(pgd_bad(*top)))
+		goto untouched;
+
+	upper = pud_offset(top, address);
+	if (pud_none(*upper) || unlikely(pud_bad(*upper)))
+		goto untouched;
+
+	middle = pmd_offset(upper, address);
+	if (pmd_none(*middle) || unlikely(pmd_bad(*middle)))
+		goto untouched;
+
+	/* Every level is populated: a pte slot for 'address' exists. */
+	return 0;
+
+untouched:
+	return 1;
+}
+
+
+/**
+ * get_user_pages - look up, faulting in if needed, a run of user pages
+ * @tsk: task used for the gate-area lookup and charged with the minor and
+ *	major faults taken on the way
+ * @mm: address space to walk
+ * @start: first user address; advanced by PAGE_SIZE per page handled
+ * @len: number of pages wanted; a value <= 0 yields 0 straight away
+ * @write: nonzero to require (and fault for) write access
+ * @force: nonzero to accept the VM_MAY* flags instead of VM_READ/VM_WRITE
+ * @pages: optional array that receives one struct page pointer per page;
+ *	a reference is taken on each non-reserved page stored
+ * @vmas: optional array that receives the vma covering each page
+ *
+ * Returns the number of pages handled, which may be short of @len when a
+ * later page cannot be reached.  When nothing at all was handled the result
+ * is -EFAULT (no usable vma, VM_IO vma, missing permission, write to the
+ * gate area, SIGBUS from the fault handler) or -ENOMEM (fault handler out
+ * of memory).  If a looked-up page has no valid struct page, every page
+ * stored so far is released again and -EFAULT is returned.
+ *
+ * mm->page_table_lock is taken and dropped internally.
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int i;
+	unsigned int flags;
+
+	/*
+	 * The loops below are do/while loops that stop only when len hits
+	 * exactly zero.  A zero or negative count would therefore pin pages
+	 * the caller never asked for and keep walking, so reject it here.
+	 */
+	if (len <= 0)
+		return 0;
+
+	/* 
+	 * Require read or write permissions.
+	 * If 'force' is set, we only require the "MAY" flags.
+	 */
+	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	i = 0;
+
+	do {
+		struct vm_area_struct *	vma;
+
+		vma = find_extend_vma(mm, start);
+		if (!vma && in_gate_area(tsk, start)) {
+			/* No vma, but the address is in the gate area. */
+			unsigned long pg = start & PAGE_MASK;
+			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+			pte_t *pte;
+			if (write) /* user gate pages are read-only */
+				return i ? i : -EFAULT;
+			if (pg > TASK_SIZE)
+				pgd = pgd_offset_k(pg);
+			else
+				pgd = pgd_offset_gate(mm, pg);
+			BUG_ON(pgd_none(*pgd));
+			pud = pud_offset(pgd, pg);
+			BUG_ON(pud_none(*pud));
+			pmd = pmd_offset(pud, pg);
+			BUG_ON(pmd_none(*pmd));
+			pte = pte_offset_map(pmd, pg);
+			BUG_ON(pte_none(*pte));
+			if (pages) {
+				pages[i] = pte_page(*pte);
+				get_page(pages[i]);
+			}
+			pte_unmap(pte);
+			if (vmas)
+				vmas[i] = gate_vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+			continue;
+		}
+
+		if (!vma || (vma->vm_flags & VM_IO)
+				|| !(flags & vma->vm_flags))
+			return i ? i : -EFAULT;
+
+		if (is_vm_hugetlb_page(vma)) {
+			/* The hugetlb helper advances start, len and i itself. */
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i);
+			continue;
+		}
+		spin_lock(&mm->page_table_lock);
+		do {
+			struct page *map;
+			int lookup_write = write;
+
+			cond_resched_lock(&mm->page_table_lock);
+			while (!(map = follow_page(mm, start, lookup_write))) {
+				/*
+				 * Shortcut for anonymous pages. We don't want
+				 * to force the creation of page tables for
+				 * insanely big anonymously mapped areas that
+				 * nobody touched so far. This is important
+				 * for doing a core dump for these mappings.
+				 */
+				if (!lookup_write &&
+				    untouched_anonymous_page(mm,vma,start)) {
+					map = ZERO_PAGE(start);
+					break;
+				}
+				/* The fault handler is called without the lock. */
+				spin_unlock(&mm->page_table_lock);
+				switch (handle_mm_fault(mm,vma,start,write)) {
+				case VM_FAULT_MINOR:
+					tsk->min_flt++;
+					break;
+				case VM_FAULT_MAJOR:
+					tsk->maj_flt++;
+					break;
+				case VM_FAULT_SIGBUS:
+					return i ? i : -EFAULT;
+				case VM_FAULT_OOM:
+					return i ? i : -ENOMEM;
+				default:
+					BUG();
+				}
+				/*
+				 * Now that we have performed a write fault
+				 * and surely no longer have a shared page we
+				 * shouldn't write, we shouldn't ignore an
+				 * unwritable page in the page table if
+				 * we are forcing write access.
+				 */
+				lookup_write = write && !force;
+				spin_lock(&mm->page_table_lock);
+			}
+			if (pages) {
+				pages[i] = get_page_map(map);
+				if (!pages[i]) {
+					/* Undo: drop every page stored so far. */
+					spin_unlock(&mm->page_table_lock);
+					while (i--)
+						page_cache_release(pages[i]);
+					i = -EFAULT;
+					goto out;
+				}
+				flush_dcache_page(pages[i]);
+				if (!PageReserved(pages[i]))
+					page_cache_get(pages[i]);
+			}
+			if (vmas)
+				vmas[i] = vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+		} while(len && start < vma->vm_end);
+		spin_unlock(&mm->page_table_lock);
+	} while(len);
+out:
+	return i;
+}
+
+EXPORT_SYMBOL(get_user_pages);
+
+/*
+ * Fill the pte slots for [addr, end) under @pmd with write-protected
+ * mappings of the zero page.  Every slot must be empty beforehand.
+ * Returns 0, or -ENOMEM when the pte page cannot be allocated.
+ */
+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pte_t *slot = pte_alloc_map(mm, pmd, addr);
+
+	if (!slot)
+		return -ENOMEM;
+
+	for (;;) {
+		pte_t zero_pte;
+
+		zero_pte = mk_pte(ZERO_PAGE(addr), prot);
+		zero_pte = pte_wrprotect(zero_pte);
+		BUG_ON(!pte_none(*slot));
+		set_pte_at(mm, addr, slot, zero_pte);
+
+		addr += PAGE_SIZE;
+		if (addr == end)
+			break;
+		slot++;
+	}
+	/* slot still points at the last pte written */
+	pte_unmap(slot);
+	return 0;
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud and zero-map each
+ * piece via zeromap_pte_range().  Returns 0 or -ENOMEM.
+ */
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	unsigned long stop;
+	pmd_t *entry = pmd_alloc(mm, pud, addr);
+
+	if (!entry)
+		return -ENOMEM;
+
+	for (;;) {
+		stop = pmd_addr_end(addr, end);
+		if (zeromap_pte_range(mm, entry, addr, stop, prot))
+			return -ENOMEM;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd and zero-map each
+ * piece via zeromap_pmd_range().  Returns 0 or -ENOMEM.
+ */
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	unsigned long stop;
+	pud_t *entry = pud_alloc(mm, pgd, addr);
+
+	if (!entry)
+		return -ENOMEM;
+
+	for (;;) {
+		stop = pud_addr_end(addr, end);
+		if (zeromap_pmd_range(mm, entry, addr, stop, prot))
+			return -ENOMEM;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Map the zero page, write-protected, over [addr, addr + size) of @vma.
+ * The range must be non-empty.  The walk runs under mm->page_table_lock
+ * and stops at the first error.  Returns 0 or the error from the walk.
+ */
+int zeromap_page_range(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long size, pgprot_t prot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long end = addr + size;
+	unsigned long stop;
+	pgd_t *entry;
+	int err;
+
+	BUG_ON(addr >= end);
+	entry = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (;;) {
+		stop = pgd_addr_end(addr, end);
+		err = zeromap_pud_range(mm, entry, addr, stop, prot);
+		if (err)
+			break;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	return err;
+}
+
+/*
+ * Point the pte slots for [addr, end) under @pmd at consecutive page frames
+ * starting with @pfn.  Every slot must be empty beforehand.  A pte is only
+ * written when the pfn is not valid or its struct page is reserved; for any
+ * other pfn the slot is left empty.
+ * Returns 0, or -ENOMEM when the pte page cannot be allocated.
+ */
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	pte_t *slot = pte_alloc_map(mm, pmd, addr);
+
+	if (!slot)
+		return -ENOMEM;
+
+	for (;; slot++, pfn++) {
+		BUG_ON(!pte_none(*slot));
+		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
+			set_pte_at(mm, addr, slot, pfn_pte(pfn, prot));
+
+		addr += PAGE_SIZE;
+		if (addr == end)
+			break;
+	}
+	/* slot still points at the last pte visited */
+	pte_unmap(slot);
+	return 0;
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under @pud and remap each
+ * piece via remap_pte_range().  @pfn is the frame that belongs to @addr.
+ * Returns 0 or -ENOMEM.
+ */
+static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	/* frame number of virtual page 0, so frame = base + page number */
+	unsigned long base = pfn - (addr >> PAGE_SHIFT);
+	unsigned long stop;
+	pmd_t *entry = pmd_alloc(mm, pud, addr);
+
+	if (!entry)
+		return -ENOMEM;
+
+	for (;;) {
+		stop = pmd_addr_end(addr, end);
+		if (remap_pte_range(mm, entry, addr, stop,
+				base + (addr >> PAGE_SHIFT), prot))
+			return -ENOMEM;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd and remap each
+ * piece via remap_pmd_range().  @pfn is the frame that belongs to @addr.
+ * Returns 0 or -ENOMEM.
+ */
+static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	/* frame number of virtual page 0, so frame = base + page number */
+	unsigned long base = pfn - (addr >> PAGE_SHIFT);
+	unsigned long stop;
+	pud_t *entry = pud_alloc(mm, pgd, addr);
+
+	if (!entry)
+		return -ENOMEM;
+
+	for (;;) {
+		stop = pud_addr_end(addr, end);
+		if (remap_pmd_range(mm, entry, addr, stop,
+				base + (addr >> PAGE_SHIFT), prot))
+			return -ENOMEM;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Map the page frames starting at @pfn over [addr, addr + size) of @vma
+ * with protection @prot.  The range must be non-empty.  The vma is marked
+ * VM_IO | VM_RESERVED before anything else happens.  The walk runs under
+ * mm->page_table_lock and stops at the first error.
+ * Returns 0 or the error from the walk.
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long end = addr + size;
+	unsigned long base;
+	unsigned long stop;
+	pgd_t *entry;
+	int err;
+
+	/*
+	 * Physically remapped pages are special. Tell the
+	 * rest of the world about it:
+	 *   VM_IO tells people not to look at these pages
+	 *	(accesses can have side effects).
+	 *   VM_RESERVED tells swapout not to try to touch
+	 *	this region.
+	 */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
+	BUG_ON(addr >= end);
+	/* frame number of virtual page 0, so frame = base + page number */
+	base = pfn - (addr >> PAGE_SHIFT);
+	entry = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (;;) {
+		stop = pgd_addr_end(addr, end);
+		err = remap_pud_range(mm, entry, addr, stop,
+				base + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			break;
+		entry++;
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(remap_pfn_range);
+
+/*
+ * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
+ * servicing faults for write access.  In the normal case, we do always
+ * want pte_mkwrite.  But get_user_pages can cause write faults for mappings
+ * that do not have writing enabled, when used by access_process_vm.
+ *
+ * Returns @pte made writable when @vma has VM_WRITE, else @pte unchanged.
+ */
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+	if (unlikely(!(vma->vm_flags & VM_WRITE)))
+		return pte;
+
+	return pte_mkwrite(pte);
+}
+
+/*
+ * Install @new_page at @address in @vma: build a dirty pte for it (made
+ * writable only if the vma allows, see maybe_mkwrite()), establish it in
+ * @page_table and update the MMU cache.
+ *
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ */
+static inline void break_cow(struct vm_area_struct *vma,
+		struct page *new_page, unsigned long address,
+		pte_t *page_table)
+{
+	pte_t entry = mk_pte(new_page, vma->vm_page_prot);
+
+	entry = pte_mkdirty(entry);
+	entry = maybe_mkwrite(entry, vma);
+
+	ptep_establish(vma, address, page_table, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Goto-purists beware: the only reason for goto's here is that it results
+ * in better assembly code.. The "default" path will see no jumps at all.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We hold the mm semaphore and the page_table_lock on entry and exit
+ * with the page_table_lock released.
+ *
+ * @page_table is the mapped pte for @address and @pte the value read from
+ * it.  The pte mapping is dropped on every path.
+ *
+ * Returns VM_FAULT_MINOR when the page was reused in place, when the copy
+ * was installed, or when the pte changed while the lock was dropped (the
+ * fresh copy is then simply freed).  Returns VM_FAULT_OOM when the pfn is
+ * not valid, when anon_vma_prepare() fails, or when no page can be
+ * allocated.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
+	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+{
+	struct page *old_page, *new_page;
+	unsigned long pfn = pte_pfn(pte);
+	pte_t entry;
+
+	if (unlikely(!pfn_valid(pfn))) {
+		/*
+		 * This should really halt the system so it can be debugged or
+		 * at least the kernel stops what it's doing before it corrupts
+		 * data, but for the moment just pretend this is OOM.
+		 */
+		pte_unmap(page_table);
+		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
+				address);
+		spin_unlock(&mm->page_table_lock);
+		return VM_FAULT_OOM;
+	}
+	old_page = pfn_to_page(pfn);
+
+	if (!TestSetPageLocked(old_page)) {	/* we got the page lock */
+		int reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+		if (reuse) {
+			/* no copy needed: make the existing pte writable */
+			flush_cache_page(vma, address, pfn);
+			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
+					      vma);
+			ptep_set_access_flags(vma, address, page_table, entry, 1);
+			update_mmu_cache(vma, address, entry);
+			lazy_mmu_prot_update(entry);
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			return VM_FAULT_MINOR;
+		}
+	}
+	pte_unmap(page_table);
+
+	/*
+	 * Ok, we need to copy. Oh, well..
+	 * Take a reference on the old page first (reserved pages excepted),
+	 * then drop the lock for the allocation and copy below.
+	 */
+	if (!PageReserved(old_page))
+		page_cache_get(old_page);
+	spin_unlock(&mm->page_table_lock);
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto no_new_page;
+	if (old_page == ZERO_PAGE(address)) {
+		/* source is the zero page: a zeroed page replaces the copy */
+		new_page = alloc_zeroed_user_highpage(vma, address);
+		if (!new_page)
+			goto no_new_page;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		if (!new_page)
+			goto no_new_page;
+		copy_user_highpage(new_page, old_page, address);
+	}
+	/*
+	 * Re-check the pte - we dropped the lock
+	 */
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+	if (likely(pte_same(*page_table, pte))) {
+		if (PageAnon(old_page))
+			dec_mm_counter(mm, anon_rss);
+		if (PageReserved(old_page))
+			inc_mm_counter(mm, rss);
+		else
+			page_remove_rmap(old_page);
+		flush_cache_page(vma, address, pfn);
+		break_cow(vma, new_page, address, page_table);
+		lru_cache_add_active(new_page);
+		page_add_anon_rmap(new_page, vma, address);
+
+		/* Free the old page.. */
+		new_page = old_page;
+	}
+	pte_unmap(page_table);
+	page_cache_release(new_page);	/* old page, or the unused copy on a race */
+	page_cache_release(old_page);	/* pairs with the reference taken above */
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_MINOR;
+
+no_new_page:
+	page_cache_release(old_page);	/* page_table_lock is already released here */
+	return VM_FAULT_OOM;
+}
+
+/*
+ * Helper functions for unmap_mapping_range().
+ *
+ * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ *
+ * We have to restart searching the prio_tree whenever we drop the lock,
+ * since the iterator is only valid while the lock is held, and anyway
+ * a later vma might be split and reinserted earlier while lock dropped.
+ *
+ * The list of nonlinear vmas could be handled more efficiently, using
+ * a placeholder, but handle it in the same way until a need is shown.
+ * It is important to search the prio_tree before nonlinear list: a vma
+ * may become nonlinear and be shifted from prio_tree to nonlinear list
+ * while the lock is dropped; but never shifted from list to prio_tree.
+ *
+ * In order to make forward progress despite restarting the search,
+ * vm_truncate_count is used to mark a vma as now dealt with, so we can
+ * quickly skip it next time around.  Since the prio_tree search only
+ * shows us those vmas affected by unmapping the range in question, we
+ * can't efficiently keep all vmas in step with mapping->truncate_count:
+ * so instead reset them all whenever it wraps back to 0 (then go to 1).
+ * mapping->truncate_count and vma->vm_truncate_count are protected by
+ * i_mmap_lock.
+ *
+ * In order to make forward progress despite repeatedly restarting some
+ * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * and restart from that address when we reach that vma again.  It might
+ * have been split or merged, shrunk or extended, but never shifted: so
+ * restart_addr remains valid so long as it remains in the vma's range.
+ * unmap_mapping_range forces truncate_count to leap over page-aligned
+ * values so we can save vma's restart_addr in its truncate_count field.
+ *
+ * is_restart_addr() tells the two uses of the field apart: it is true for
+ * a page-aligned value (a saved restart address, or 0) and false for a
+ * genuine truncate_count, which is never page-aligned.
+ */
+#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
+
+/*
+ * Clear vm_truncate_count in every vma attached to @mapping: first those
+ * in the i_mmap prio_tree, then those on the i_mmap_nonlinear list.
+ */
+static void reset_vma_truncate_counts(struct address_space *mapping)
+{
+	struct prio_tree_iter cursor;
+	struct vm_area_struct *cur;
+
+	vma_prio_tree_foreach(cur, &cursor, &mapping->i_mmap, 0, ULONG_MAX) {
+		cur->vm_truncate_count = 0;
+	}
+
+	list_for_each_entry(cur, &mapping->i_mmap_nonlinear,
+			    shared.vm_set.list) {
+		cur->vm_truncate_count = 0;
+	}
+}
+
+static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+		unsigned long start_addr, unsigned long end_addr,
+		struct zap_details *details)
+{
+	unsigned long restart_addr;	/* vm_truncate_count, possibly a saved address */
+	int need_break;		/* nonzero: drop i_mmap_lock before going on */
+
+again:
+	restart_addr = vma->vm_truncate_count;
+	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
+		start_addr = restart_addr;	/* resume where the last pass stopped */
+		if (start_addr >= end_addr) {
+			/* Top of vma has been split off since last time */
+			vma->vm_truncate_count = details->truncate_count;
+			return 0;
+		}
+	}
+
+	details->break_addr = end_addr;	/* still end_addr afterwards == ran to the end */
+	zap_page_range(vma, start_addr, end_addr - start_addr, details);
+
+	/*
+	 * We cannot rely on the break test in unmap_vmas:
+	 * on the one hand, we don't want to restart our loop
+	 * just because that broke out for the page_table_lock;
+	 * on the other hand, it does no test when vma is small.
+	 */
+	need_break = need_resched() ||
+			need_lockbreak(details->i_mmap_lock);
+
+	if (details->break_addr >= end_addr) {
+		/* We have now completed this vma: mark it so */
+		vma->vm_truncate_count = details->truncate_count;
+		if (!need_break)
+			return 0;
+	} else {
+		/* Note restart_addr in vma's truncate_count field */
+		vma->vm_truncate_count = details->break_addr;
+		if (!need_break)
+			goto again;
+	}
+
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;	/* lock was dropped: callers restart their search */
+}
+
+/*
+ * Unmap the file range described by @details from every vma found for it
+ * in the prio_tree @root.  Vmas already stamped with the current
+ * truncate_count are skipped.  Whenever unmap_mapping_range_vma() reports
+ * a negative value, the tree search starts over from the beginning.
+ */
+static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+					    struct zap_details *details)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *vma;
+
+restart:
+	vma_prio_tree_foreach(vma, &iter, root,
+			details->first_index, details->last_index) {
+		pgoff_t vba, vea, zba, zea;
+		unsigned long zap_start, zap_end;
+
+		/* Only handle vmas we have not dealt with yet */
+		if (vma->vm_truncate_count != details->truncate_count) {
+			/* First and last file page covered by this vma */
+			vba = vma->vm_pgoff;
+			vea = vba +
+			      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+
+			/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
+			zba = (details->first_index < vba) ?
+					vba : details->first_index;
+			zea = (details->last_index > vea) ?
+					vea : details->last_index;
+
+			/* Translate the file pages back into addresses */
+			zap_start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start;
+			zap_end = ((zea - vba + 1) << PAGE_SHIFT) +
+					vma->vm_start;
+
+			if (unmap_mapping_range_vma(vma, zap_start, zap_end,
+						    details) < 0)
+				goto restart;
+		}
+	}
+}
+
+/*
+ * Unmap from every vma on the nonlinear list @head.  Vmas already stamped
+ * with the current truncate_count are skipped.  Whenever
+ * unmap_mapping_range_vma() reports a negative value, the list walk starts
+ * over from the beginning.
+ *
+ * In nonlinear VMAs there is no correspondence between virtual address
+ * offset and file offset.  So we must perform an exhaustive search
+ * across *all* the pages in each nonlinear VMA, not just the pages
+ * whose virtual address lies outside the file truncation point.
+ */
+static inline void unmap_mapping_range_list(struct list_head *head,
+					    struct zap_details *details)
+{
+	struct vm_area_struct *cur;
+
+restart:
+	list_for_each_entry(cur, head, shared.vm_set.list) {
+		/* Only handle vmas we have not dealt with yet */
+		if (cur->vm_truncate_count != details->truncate_count) {
+			details->nonlinear_vma = cur;
+			if (unmap_mapping_range_vma(cur, cur->vm_start,
+						    cur->vm_end, details) < 0)
+				goto restart;
+		}
+	}
+}
+
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps
+ * in the specified address_space corresponding to the specified
+ * page range in the underlying file.
+ * @mapping: the address space containing mmaps to be unmapped.
+ * @holebegin: byte in first page to unmap, relative to the start of
+ * the underlying file.  This will be rounded down to a PAGE_SIZE
+ * boundary.  Note that this is different from vmtruncate(), which
+ * must keep the partial page.  In contrast, we must get rid of
+ * partial pages.
+ * @holelen: size of prospective hole in bytes.  This will be rounded
+ * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
+ * end of the file.
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages;
+ * but 0 when invalidating pagecache, don't throw away private data.
+ *
+ * Takes and releases mapping->i_mmap_lock, and advances
+ * mapping->truncate_count under it before unmapping.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+	struct zap_details details;
+	pgoff_t hba = holebegin >> PAGE_SHIFT;	/* first page of the hole */
+	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* hole length in pages */
+
+	/* Check for overflow. */
+	if (sizeof(holelen) > sizeof(hlen)) {
+		long long holeend =
+			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (holeend & ~(long long)ULONG_MAX)
+			hlen = ULONG_MAX - hba + 1;	/* so that last_index == ULONG_MAX */
+	}
+
+	details.check_mapping = even_cows? NULL: mapping;
+	details.nonlinear_vma = NULL;
+	details.first_index = hba;
+	details.last_index = hba + hlen - 1;
+	if (details.last_index < details.first_index)
+		details.last_index = ULONG_MAX;	/* wrapped, e.g. holelen == 0 */
+	details.i_mmap_lock = &mapping->i_mmap_lock;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	/* serialize i_size write against truncate_count write */
+	smp_wmb();
+	/* Protect against page faults, and endless unmapping loops */
+	mapping->truncate_count++;
+	/*
+	 * For archs where spin_lock has inclusive semantics like ia64
+	 * this smp_mb() will prevent to read pagetable contents
+	 * before the truncate_count increment is visible to
+	 * other cpus.
+	 */
+	smp_mb();
+	if (unlikely(is_restart_addr(mapping->truncate_count))) {
+		/* page-aligned counts are reserved for restart addresses */
+		if (mapping->truncate_count == 0)
+			reset_vma_truncate_counts(mapping);
+		mapping->truncate_count++;
+	}
+	details.truncate_count = mapping->truncate_count;
+
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
+		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page.  Ugly, but necessary.
+ *
+ * Sets the size of @inode to @offset.  When the file shrinks (or keeps its
+ * size), the mappings beyond the new end are unmapped and the page cache is
+ * truncated; when it grows, only the limits are checked.  In both cases the
+ * inode's ->truncate method, if any, is called afterwards.
+ *
+ * Returns 0 on success, -ETXTBSY when asked to shrink an in-use swapfile,
+ * and -EFBIG when growing past RLIMIT_FSIZE (SIGXFSZ is sent as well) or
+ * past the superblock's s_maxbytes.
+ */
+int vmtruncate(struct inode * inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long limit;
+
+	if (inode->i_size < offset)
+		goto do_expand;
+	/*
+	 * truncation of in-use swapfiles is disallowed - it would cause
+	 * subsequent swapout to scribble on the now-freed blocks.
+	 */
+	if (IS_SWAPFILE(inode))
+		goto out_busy;
+	i_size_write(inode, offset);
+	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);	/* holelen 0: to end of file */
+	truncate_inode_pages(mapping, offset);
+	goto out_truncate;
+
+do_expand:
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && offset > limit)
+		goto out_sig;
+	if (offset > inode->i_sb->s_maxbytes)
+		goto out_big;
+	i_size_write(inode, offset);
+
+out_truncate:
+	if (inode->i_op && inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+	return 0;
+out_sig:
+	send_sig(SIGXFSZ, current, 0);	/* then falls through to -EFBIG */
+out_big:
+	return -EFBIG;
+out_busy:
+	return -ETXTBSY;
+}
+
+EXPORT_SYMBOL(vmtruncate);
+
+/*
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time.  We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ *
+ * @entry selects the swap area and the block to read, @addr and @vma are
+ * handed to read_swap_cache_async() for each page.  The loop stops early
+ * at the first page that cannot be read; each page obtained is released
+ * again right away.
+ */
+void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
+{
+#ifdef CONFIG_NUMA
+	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
+#endif
+	int i, num;
+	struct page *new_page;
+	unsigned long offset;	/* first swap offset, filled in by valid_swaphandles() */
+
+	/*
+	 * Get the number of handles we should do readahead io to.
+	 */
+	num = valid_swaphandles(entry, &offset);
+	for (i = 0; i < num; offset++, i++) {
+		/* Ok, do the async read-ahead now */
+		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
+							   offset), vma, addr);
+		if (!new_page)
+			break;
+		page_cache_release(new_page);
+#ifdef CONFIG_NUMA
+		/*
+		 * Find the next applicable VMA for the NUMA policy.
+		 */
+		addr += PAGE_SIZE;
+		if (addr == 0)
+			vma = NULL;	/* address wrapped around */
+		if (vma) {
+			if (addr >= vma->vm_end) {
+				vma = next_vma;
+				next_vma = vma ? vma->vm_next : NULL;
+			}
+			if (vma && addr < vma->vm_start)
+				vma = NULL;	/* addr is in a gap before this vma */
+		} else {
+			if (next_vma && addr >= next_vma->vm_start) {
+				vma = next_vma;
+				next_vma = vma->vm_next;
+			}
+		}
+#endif
+	}
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+/*
+ * We hold the mm semaphore and the page_table_lock on entry and
+ * should release the pagetable lock on exit..
+ *
+ * Brings the page for the swap entry held in @orig_pte back and maps it at
+ * @address.  The page_table_lock is dropped around the swap cache lookup,
+ * the read and lock_page(); afterwards it is retaken and the pte is
+ * compared against @orig_pte to detect a concurrent fault.
+ *
+ * Returns VM_FAULT_MINOR when the page was found in the swap cache or when
+ * somebody else handled the fault meanwhile, VM_FAULT_MAJOR when the page
+ * had to be read from the swap area, and VM_FAULT_OOM when the read failed
+ * with the pte unchanged or when the follow-up do_wp_page() reported OOM.
+ */
+static int do_swap_page(struct mm_struct * mm,
+	struct vm_area_struct * vma, unsigned long address,
+	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+{
+	struct page *page;
+	swp_entry_t entry = pte_to_swp_entry(orig_pte);
+	pte_t pte;
+	int ret = VM_FAULT_MINOR;
+
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
+	page = lookup_swap_cache(entry);
+	if (!page) {
+ 		swapin_readahead(entry, address, vma);
+ 		page = read_swap_cache_async(entry, vma, address);
+		if (!page) {
+			/*
+			 * Back out if somebody else faulted in this pte while
+			 * we released the page table lock.
+			 */
+			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
+			if (likely(pte_same(*page_table, orig_pte)))
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_MINOR;
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		/* Had to read the page from swap area: Major fault */
+		ret = VM_FAULT_MAJOR;
+		inc_page_state(pgmajfault);
+		grab_swap_token();
+	}
+
+	mark_page_accessed(page);
+	lock_page(page);
+
+	/*
+	 * Back out if somebody else faulted in this pte while we
+	 * released the page table lock.
+	 */
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, address);
+	if (unlikely(!pte_same(*page_table, orig_pte))) {
+		pte_unmap(page_table);
+		spin_unlock(&mm->page_table_lock);
+		unlock_page(page);
+		page_cache_release(page);
+		ret = VM_FAULT_MINOR;
+		goto out;
+	}
+
+	/* The page isn't present yet, go ahead with the fault. */
+		
+	swap_free(entry);
+	if (vm_swap_full())
+		remove_exclusive_swap_page(page);
+
+	inc_mm_counter(mm, rss);
+	pte = mk_pte(page, vma->vm_page_prot);
+	if (write_access && can_share_swap_page(page)) {
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+		write_access = 0;	/* write handled here: no do_wp_page() below */
+	}
+	unlock_page(page);
+
+	flush_icache_page(vma, page);
+	set_pte_at(mm, address, page_table, pte);
+	page_add_anon_rmap(page, vma, address);
+
+	if (write_access) {
+		if (do_wp_page(mm, vma, address,
+				page_table, pmd, pte) == VM_FAULT_OOM)
+			ret = VM_FAULT_OOM;
+		goto out;	/* do_wp_page() exits with page_table_lock released */
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
+	lazy_mmu_prot_update(pte);
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
+out:
+	return ret;
+}
+
+/*
+ * We are called with the MM semaphore and page_table_lock
+ * spinlock held to protect against concurrent faults in
+ * multithreaded programs.
+ *
+ * Handles a fault on an empty pte of a vma without a nopage method.  A
+ * read fault maps the zero page write-protected.  A write fault drops the
+ * page_table_lock, allocates a zeroed private page, retakes the lock and
+ * installs the page unless the pte got filled in meanwhile.
+ *
+ * The page_table_lock is released on every return path.  Returns
+ * VM_FAULT_MINOR, or VM_FAULT_OOM when anon_vma_prepare() or the page
+ * allocation fails.
+ */
+static int
+do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		pte_t *page_table, pmd_t *pmd, int write_access,
+		unsigned long addr)
+{
+	pte_t entry;
+	struct page * page = ZERO_PAGE(addr);
+
+	/* Read-only mapping of ZERO_PAGE. */
+	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+
+	/* ..except if it's a write access */
+	if (write_access) {
+		/* Allocate our own private page. */
+		pte_unmap(page_table);
+		spin_unlock(&mm->page_table_lock);
+
+		if (unlikely(anon_vma_prepare(vma)))
+			goto no_mem;
+		page = alloc_zeroed_user_highpage(vma, addr);
+		if (!page)
+			goto no_mem;
+
+		spin_lock(&mm->page_table_lock);
+		page_table = pte_offset_map(pmd, addr);
+
+		if (!pte_none(*page_table)) {
+			/* the pte was filled in while the lock was dropped */
+			pte_unmap(page_table);
+			page_cache_release(page);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+		inc_mm_counter(mm, rss);
+		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
+							 vma->vm_page_prot)),
+				      vma);
+		lru_cache_add_active(page);
+		SetPageReferenced(page);
+		page_add_anon_rmap(page, vma, addr);
+	}
+
+	set_pte_at(mm, addr, page_table, entry);
+	pte_unmap(page_table);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, entry);
+	lazy_mmu_prot_update(entry);
+	spin_unlock(&mm->page_table_lock);
+out:
+	return VM_FAULT_MINOR;
+no_mem:
+	return VM_FAULT_OOM;	/* reached with the page_table_lock already dropped */
+}
+
+/*
+ * do_no_page() tries to create a new page mapping. It aggressively
+ * tries to share with existing pages, but makes a separate copy if
+ * the "write_access" parameter is true (and the mapping is not
+ * VM_SHARED) in order to avoid the next page fault.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * This is called with the MM semaphore held and the page table
+ * spinlock held. Exit with the spinlock released.
+ *
+ * VMAs without a ->nopage method are passed on to do_anonymous_page().
+ * Otherwise the result is VM_FAULT_SIGBUS or VM_FAULT_OOM when ->nopage
+ * returns NOPAGE_SIGBUS or NOPAGE_OOM, VM_FAULT_OOM when the early
+ * C-O-W copy cannot be set up, and else the value ->nopage stored
+ * through its third argument (preset to VM_FAULT_MINOR).
+ */
+static int
+do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+{
+	struct page * new_page;
+	struct address_space *mapping = NULL;
+	pte_t entry;
+	unsigned int sequence = 0;
+	int ret = VM_FAULT_MINOR;
+	int anon = 0;	/* becomes 1 once new_page is a private copy */
+
+	if (!vma->vm_ops || !vma->vm_ops->nopage)
+		return do_anonymous_page(mm, vma, page_table,
+					pmd, write_access, address);
+	/* ->nopage is called with the pte unmapped and the lock dropped */
+	pte_unmap(page_table);
+	spin_unlock(&mm->page_table_lock);
+
+	if (vma->vm_file) {
+		/* sample truncate_count so a concurrent truncate is noticed below */
+		mapping = vma->vm_file->f_mapping;
+		sequence = mapping->truncate_count;
+		smp_rmb(); /* serializes i_size against truncate_count */
+	}
+retry:
+	cond_resched();
+	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+	/*
+	 * No smp_rmb is needed here as long as there's a full
+	 * spin_lock/unlock sequence inside the ->nopage callback
+	 * (for the pagecache lookup) that acts as an implicit
+	 * smp_mb() and prevents the i_size read to happen
+	 * after the next truncate_count read.
+	 */
+
+	/* no page was available -- either SIGBUS or OOM */
+	if (new_page == NOPAGE_SIGBUS)
+		return VM_FAULT_SIGBUS;
+	if (new_page == NOPAGE_OOM)
+		return VM_FAULT_OOM;
+
+	/*
+	 * Should we do an early C-O-W break?
+	 */
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		struct page *page;
+
+		if (unlikely(anon_vma_prepare(vma)))
+			goto oom;
+		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		if (!page)
+			goto oom;
+		/* copy the data, then drop our reference on the ->nopage page */
+		copy_user_highpage(page, new_page, address);
+		page_cache_release(new_page);
+		new_page = page;
+		anon = 1;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If unmap_mapping_range got called,
+	 * retry getting the page.
+	 */
+	if (mapping && unlikely(sequence != mapping->truncate_count)) {
+		sequence = mapping->truncate_count;
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
+	page_table = pte_offset_map(pmd, address);
+
+	/*
+	 * This silly early PAGE_DIRTY setting removes a race
+	 * due to the bad i386 page protection. But it's valid
+	 * for other architectures too.
+	 *
+	 * Note that if write_access is true, we either now have
+	 * an exclusive copy of the page, or this is a shared mapping,
+	 * so we can make it writable and dirty to avoid having to
+	 * handle that later.
+	 */
+	/* Only go through if we didn't race with anybody else... */
+	if (pte_none(*page_table)) {
+		/* reserved pages are not counted in rss */
+		if (!PageReserved(new_page))
+			inc_mm_counter(mm, rss);
+
+		flush_icache_page(vma, new_page);
+		entry = mk_pte(new_page, vma->vm_page_prot);
+		if (write_access)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		set_pte_at(mm, address, page_table, entry);
+		if (anon) {
+			/* private copy: put it on the LRU and add the anon rmap */
+			lru_cache_add_active(new_page);
+			page_add_anon_rmap(new_page, vma, address);
+		} else
+			page_add_file_rmap(new_page);
+		pte_unmap(page_table);
+	} else {
+		/* One of our sibling threads was faster, back out. */
+		pte_unmap(page_table);
+		page_cache_release(new_page);
+		spin_unlock(&mm->page_table_lock);
+		goto out;
+	}
+
+	/* no need to invalidate: a not-present page shouldn't be cached */
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	spin_unlock(&mm->page_table_lock);
+out:
+	return ret;
+oom:
+	/* reached only before the page table lock is re-taken */
+	page_cache_release(new_page);
+	ret = VM_FAULT_OOM;
+	goto out;
+}
+
+/*
+ * Fault of a previously existing named mapping. Repopulate the pte
+ * from the encoded file_pte if possible. This enables swappable
+ * nonlinear vmas.
+ *
+ * When the vma has no ->populate method, or the access is a write to a
+ * mapping that is not VM_SHARED, the pte is cleared and the fault is
+ * handled by do_no_page() instead.  Otherwise the pte is unmapped and
+ * the page table lock dropped before ->populate is called for the one
+ * page at the file offset encoded in the pte; the result is
+ * VM_FAULT_OOM for -ENOMEM, VM_FAULT_SIGBUS for any other error and
+ * VM_FAULT_MAJOR on success.
+ */
+static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
+	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+{
+	unsigned long pgoff;
+	int err;
+
+	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
+	/*
+	 * Fall back to the linear mapping if the fs does not support
+	 * ->populate:
+	 */
+	/* (the !vma->vm_ops test repeats what the BUG_ON above checked) */
+	if (!vma->vm_ops || !vma->vm_ops->populate || 
+			(write_access && !(vma->vm_flags & VM_SHARED))) {
+		pte_clear(mm, address, pte);
+		return do_no_page(mm, vma, address, write_access, pte, pmd);
+	}
+
+	/* read the file offset out of the pte before unmapping it */
+	pgoff = pte_to_pgoff(*pte);
+
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
+
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_MAJOR;
+}
+
+/*
+ * These routines also need to handle stuff like marking pages dirty
+ * and/or accessed for architectures that don't do it in hardware (most
+ * RISC architectures).  The early dirtying is also good on the i386.
+ *
+ * There is also a hook called "update_mmu_cache()" that architectures
+ * with external mmu caches can use to update those (ie the Sparc or
+ * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * Note the "page_table_lock". It is to protect against kswapd removing
+ * pages from under us. Note that kswapd only ever _removes_ pages, never
+ * adds them. As such, once we have noticed that the page is not present,
+ * we can drop the lock early.
+ *
+ * The adding of pages is protected by the MM semaphore (which we hold),
+ * so we don't need to worry about a page suddenly being added into
+ * our VM.
+ *
+ * We enter with the pagetable spinlock held, we are supposed to
+ * release it when done.
+ *
+ * Dispatch: a non-present pte goes to do_no_page() (empty pte),
+ * do_file_page() (file pte) or do_swap_page() (anything else); a
+ * present pte that is written to without write permission goes to
+ * do_wp_page().  All remaining cases only update the young/dirty bits
+ * and return VM_FAULT_MINOR.
+ */
+static inline int handle_pte_fault(struct mm_struct *mm,
+	struct vm_area_struct * vma, unsigned long address,
+	int write_access, pte_t *pte, pmd_t *pmd)
+{
+	pte_t entry;
+
+	entry = *pte;
+	if (!pte_present(entry)) {
+		/*
+		 * If it truly wasn't present, we know that kswapd
+		 * and the PTE updates will not touch it later. So
+		 * drop the lock.
+		 */
+		if (pte_none(entry))
+			return do_no_page(mm, vma, address, write_access, pte, pmd);
+		if (pte_file(entry))
+			return do_file_page(mm, vma, address, write_access, pte, pmd);
+		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+	}
+
+	if (write_access) {
+		if (!pte_write(entry))
+			return do_wp_page(mm, vma, address, pte, pmd, entry);
+
+		entry = pte_mkdirty(entry);
+	}
+	/* present (and writable if written to): just record the access */
+	entry = pte_mkyoung(entry);
+	ptep_set_access_flags(vma, address, pte, entry, write_access);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_MINOR;
+}
+
+/*
+ * By the time we get here, we already hold the mm semaphore
+ *
+ * Entry point for resolving a fault at @address in @vma.  Returns
+ * VM_FAULT_SIGBUS for hugetlb vmas, VM_FAULT_OOM when a page table
+ * level cannot be allocated, and otherwise whatever handle_pte_fault()
+ * returns.  The page table lock is taken here; it is released on the
+ * oom path below, and handle_pte_fault() is entered with it held.
+ */
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+		unsigned long address, int write_access)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	__set_current_state(TASK_RUNNING);
+
+	inc_page_state(pgfault);
+
+	if (is_vm_hugetlb_page(vma))
+		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
+
+	/*
+	 * We need the page table lock to synchronize with kswapd
+	 * and the SMP-safe atomic PTE updates.
+	 */
+	pgd = pgd_offset(mm, address);
+	spin_lock(&mm->page_table_lock);
+
+	/* allocate (or look up) each level on the way down to the pte */
+	pud = pud_alloc(mm, pgd, address);
+	if (!pud)
+		goto oom;
+
+	pmd = pmd_alloc(mm, pud, address);
+	if (!pmd)
+		goto oom;
+
+	pte = pte_alloc_map(mm, pmd, address);
+	if (!pte)
+		goto oom;
+	
+	return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+
+ oom:
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_OOM;
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+/*
+ * Allocate page upper directory.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
+ *
+ * The lock is dropped around pud_alloc_one() and re-taken before
+ * returning, including on the failure path.  Returns NULL when the
+ * allocation fails, otherwise the pud entry for @address under @pgd.
+ */
+pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	pud_t *new;
+
+	spin_unlock(&mm->page_table_lock);
+	new = pud_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
+
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
+	if (pgd_present(*pgd)) {
+		/* lost the race: discard our table and use the existing one */
+		pud_free(new);
+		goto out;
+	}
+	pgd_populate(mm, pgd, new);
+ out:
+	return pud_offset(pgd, address);
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+/*
+ * Allocate page middle directory.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
+ *
+ * The lock is dropped around pmd_alloc_one() and re-taken before
+ * returning, including on the failure path.  Returns NULL when the
+ * allocation fails, otherwise the pmd entry for @address under @pud.
+ */
+pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	pmd_t *new;
+
+	spin_unlock(&mm->page_table_lock);
+	new = pmd_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (!new)
+		return NULL;
+
+	/*
+	 * Because we dropped the lock, we should re-check the
+	 * entry, as somebody else could have populated it..
+	 */
+#ifndef __ARCH_HAS_4LEVEL_HACK
+	if (pud_present(*pud)) {
+		pmd_free(new);
+		goto out;
+	}
+	pud_populate(mm, pud, new);
+#else
+	/* with __ARCH_HAS_4LEVEL_HACK the pgd_* accessors are applied to @pud */
+	if (pgd_present(*pud)) {
+		pmd_free(new);
+		goto out;
+	}
+	pgd_populate(mm, pud, new);
+#endif /* __ARCH_HAS_4LEVEL_HACK */
+
+ out:
+	return pmd_offset(pud, address);
+}
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+/*
+ * Fault in every page of [addr, end) in the current process through
+ * get_user_pages(), asking for write access when the vma found for
+ * @addr is VM_WRITE.
+ *
+ * Returns 0 when all pages were obtained, the negative value from
+ * get_user_pages() on error, and -1 when no vma is found or fewer
+ * pages than requested were obtained.  An empty range, or one that
+ * extends past the vma's end, is a BUG().
+ */
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+	struct vm_area_struct *area;
+	int want_write, npages, got;
+
+	area = find_vma(current->mm, addr);
+	if (!area)
+		return -1;
+
+	want_write = (area->vm_flags & VM_WRITE) != 0;
+	if (addr >= end)
+		BUG();
+	if (end > area->vm_end)
+		BUG();
+
+	/* number of pages touched by the range, rounding end up */
+	npages = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+	got = get_user_pages(current, current->mm, addr,
+			npages, want_write, 0, NULL, NULL);
+	if (got < 0)
+		return got;
+	if (got != npages)
+		return -1;
+	return 0;
+}
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page.
+ *
+ * Walks the kernel page tables for @vmalloc_addr and returns the page
+ * of a present pte, or NULL when any level of the walk is empty or the
+ * pte is not present.
+ */
+struct page * vmalloc_to_page(void * vmalloc_addr)
+{
+	unsigned long kaddr = (unsigned long) vmalloc_addr;
+	struct page *found = NULL;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *mapped, val;
+
+	pgd = pgd_offset_k(kaddr);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, kaddr);
+	if (pud_none(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, kaddr);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	mapped = pte_offset_map(pmd, kaddr);
+	val = *mapped;
+	if (pte_present(val))
+		found = pte_page(val);
+	pte_unmap(mapped);
+
+	return found;
+}
+
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ *
+ * The result of vmalloc_to_page() is handed to page_to_pfn() without
+ * being checked first.
+ */
+unsigned long vmalloc_to_pfn(void * vmalloc_addr)
+{
+	struct page *pg = vmalloc_to_page(vmalloc_addr);
+
+	return page_to_pfn(pg);
+}
+
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+/*
+ * update_mem_hiwater
+ *	- update per process rss and vm high water data
+ *
+ * Raises hiwater_rss / hiwater_vm of @tsk's mm to the current rss
+ * counter / total_vm when those are larger.  Does nothing for a task
+ * without an mm.
+ */
+void update_mem_hiwater(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+	unsigned long cur_rss;
+
+	if (!mm)
+		return;
+
+	cur_rss = get_mm_counter(mm, rss);
+	if (cur_rss > mm->hiwater_rss)
+		mm->hiwater_rss = cur_rss;
+	if (mm->total_vm > mm->hiwater_vm)
+		mm->hiwater_vm = mm->total_vm;
+}
+
+#if !defined(__HAVE_ARCH_GATE_AREA)
+
+#if defined(AT_SYSINFO_EHDR)
+/* The vma handed out by get_gate_vma(); filled in by gate_vma_init(). */
+struct vm_area_struct gate_vma;
+
+/*
+ * Initcall: describe the range FIXADDR_USER_START..FIXADDR_USER_END as
+ * a read-only vma that belongs to no mm and carries no vm_flags.
+ */
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_start = FIXADDR_USER_START;
+	gate_vma.vm_end = FIXADDR_USER_END;
+	gate_vma.vm_flags = 0;
+	gate_vma.vm_page_prot = PAGE_READONLY;
+	gate_vma.vm_mm = NULL;
+	return 0;
+}
+__initcall(gate_vma_init);
+#endif
+
+/*
+ * Return the gate vma, or NULL when AT_SYSINFO_EHDR is not defined.
+ * @tsk is not used by this generic version.
+ */
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifndef AT_SYSINFO_EHDR
+	return NULL;
+#else
+	return &gate_vma;
+#endif
+}
+
+/*
+ * Return 1 when @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END),
+ * 0 otherwise; always 0 when AT_SYSINFO_EHDR is not defined.
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+#ifdef AT_SYSINFO_EHDR
+	return addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END;
+#else
+	return 0;
+#endif
+}
+
+#endif	/* __HAVE_ARCH_GATE_AREA */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
new file mode 100644
index 000000000000..a3b44a671cec
--- /dev/null
+++ b/mm/mempolicy.c
@@ -0,0 +1,1138 @@
+/*
+ * Simple NUMA memory policy for the Linux kernel.
+ *
+ * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, version 2.
+ *
+ * NUMA policy allows the user to give hints in which node(s) memory should
+ * be allocated.
+ *
+ * Support four policies per VMA and per process:
+ *
+ * The VMA policy has priority over the process policy for a page fault.
+ *
+ * interleave     Allocate memory interleaved over a set of nodes,
+ *                with normal fallback if it fails.
+ *                For VMA based allocations this interleaves based on the
+ *                offset into the backing object or offset into the mapping
+ *                for anonymous memory. For process policy a process counter
+ *                is used.
+ * bind           Only allocate memory on a specific set of nodes,
+ *                no fallback.
+ * preferred       Try a specific node first before normal fallback.
+ *                As a special case node -1 here means do the allocation
+ *                on the local CPU. This is normally identical to default,
+ *                but useful to set in a VMA when you have a non default
+ *                process policy.
+ * default        Allocate on the local node first, or when on a VMA
+ *                use the process policy. This is what Linux always did
+ *		  in a NUMA aware kernel and still does by, ahem, default.
+ *
+ * The process policy is applied for most non interrupt memory allocations
+ * in that process' context. Interrupts ignore the policies and always
+ * try to allocate on the local CPU. The VMA policy is only applied for memory
+ * allocations for a VMA in the VM.
+ *
+ * Currently there are a few corner cases in swapping where the policy
+ * is not applied, but the majority should be handled. When process policy
+ * is used it is not remembered over swap outs/swap ins.
+ *
+ * Only the highest zone in the zone hierarchy gets policied. Allocations
+ * requesting a lower zone just use default policy. This implies that
+ * on systems with highmem, kernel lowmem allocations don't get policied.
+ * Same with GFP_DMA allocations.
+ *
+ * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * all users and remembered even when nobody has memory mapped.
+ */
+
+/* Notebook:
+   fix mmap readahead to honour policy and enable policy for any page cache
+   object
+   statistics for bigpages
+   global policy for page cache? currently it uses process policy. Requires
+   first item above.
+   handle mremap for shared memory (currently ignored for the policy)
+   grows down?
+   make bind policy root only? It can trigger oom much faster and the
+   kernel is not always grateful with that.
+   could replace all the switch()es with a mempolicy_ops structure.
+*/
+
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nodemask.h>
+#include <linux/cpuset.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+
+/* Slab cache from which struct mempolicy objects are allocated. */
+static kmem_cache_t *policy_cache;
+static kmem_cache_t *sn_cache;
+
+/* Debug printk hook; defined empty, so PDprintk() calls compile to nothing. */
+#define PDprintk(fmt...)
+
+/* Highest zone. A specific allocation for a zone below that is not
+   policied. */
+static int policy_zone;
+
+/* Policy used whenever no other policy applies. */
+static struct mempolicy default_policy = {
+	.refcnt = ATOMIC_INIT(1), /* never free it */
+	.policy = MPOL_DEFAULT,
+};
+
+/*
+ * Check if all specified nodes are online.
+ *
+ * Returns 0 when every bit set in @nodes is also set in the online
+ * map, -EINVAL otherwise.  An empty online map is treated as if node 0
+ * were online.
+ */
+static int nodes_online(unsigned long *nodes)
+{
+	DECLARE_BITMAP(allowed, MAX_NUMNODES);
+
+	bitmap_copy(allowed, nodes_addr(node_online_map), MAX_NUMNODES);
+	if (bitmap_empty(allowed, MAX_NUMNODES))
+		set_bit(0, allowed);
+
+	return bitmap_subset(nodes, allowed, MAX_NUMNODES) ? 0 : -EINVAL;
+}
+
+/*
+ * Do sanity checking on a policy.
+ *
+ * MPOL_DEFAULT must come with an empty node set; MPOL_BIND and
+ * MPOL_INTERLEAVE must come with a non-empty one.  Any other mode is
+ * accepted with either (preferred will only use the first bit, but
+ * more are allowed for now).  Finally all named nodes must be online.
+ * Returns 0 or -EINVAL.
+ */
+static int mpol_check_policy(int mode, unsigned long *nodes)
+{
+	int no_nodes = bitmap_empty(nodes, MAX_NUMNODES);
+
+	if (mode == MPOL_DEFAULT) {
+		if (!no_nodes)
+			return -EINVAL;
+	} else if (mode == MPOL_BIND || mode == MPOL_INTERLEAVE) {
+		if (no_nodes)
+			return -EINVAL;
+	}
+
+	return nodes_online(nodes);
+}
+
+/*
+ * Copy a node mask from user space.
+ *
+ * @nodes:   kernel bitmap of MAX_NUMNODES bits that receives the mask
+ * @nmask:   user pointer to the mask, may be NULL
+ * @maxnode: number of bits in the user mask plus one
+ * @mode:    policy mode the mask is validated against
+ *
+ * Bits beyond MAX_NUMNODES must be zero in the user mask.  The result
+ * is restricted to current->mems_allowed and then checked with
+ * mpol_check_policy().  Returns 0, -EINVAL or -EFAULT.
+ */
+static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode, int mode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	bitmap_zero(nodes, MAX_NUMNODES);
+	/*
+	 * No mask given: the node set is empty, but the mode must still
+	 * be allowed to have an empty set (bind/interleave are not).
+	 */
+	if (maxnode == 0 || !nmask)
+		return mpol_check_policy(mode, nodes);
+	/*
+	 * Bound maxnode before deriving word counts from it.  This also
+	 * catches a caller-supplied maxnode of 0, which the decrement
+	 * above wrapped to ULONG_MAX and which would otherwise produce
+	 * nlongs == 0 and a write to nodes[-1] below.
+	 */
+	if (maxnode > PAGE_SIZE * 8)
+		return -EINVAL;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t,  nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	/* drop any bits of the last word that lie beyond maxnode */
+	nodes[nlongs-1] &= endmask;
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes);
+	return mpol_check_policy(mode, nodes);
+}
+
+/*
+ * Generate a custom zonelist for the BIND policy.
+ *
+ * Collects, for every node set in @nodes, its zones that have present
+ * pages (highest zone index first within a node) into a NULL
+ * terminated list.  Only room for the needed zone pointers plus the
+ * terminator is allocated.  As a side effect policy_zone is raised to
+ * the highest populated zone index seen.  Returns NULL when the
+ * allocation fails.
+ */
+static struct zonelist *bind_zonelist(unsigned long *nodes)
+{
+	struct zonelist *zl;
+	int num, max, nd;
+
+	/* worst case: every zone of every selected node, plus terminator */
+	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	if (!zl)
+		return NULL;
+	num = 0;
+	for (nd = find_first_bit(nodes, MAX_NUMNODES);
+	     nd < MAX_NUMNODES;
+	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+		int k;
+		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			/* skip zones without any memory */
+			if (!z->present_pages)
+				continue;
+			zl->zones[num++] = z;
+			if (k > policy_zone)
+				policy_zone = k;
+		}
+	}
+	BUG_ON(num >= max);
+	zl->zones[num] = NULL;
+	return zl;
+}
+
+/*
+ * Create a new policy.
+ *
+ * Returns NULL for MPOL_DEFAULT, an ERR_PTR(-ENOMEM) when an
+ * allocation fails, and otherwise a policy with a reference count of
+ * one whose mode-specific data was derived from @nodes:
+ *   interleave - a copy of the node bitmap
+ *   preferred  - the first node set, or -1 when none is set
+ *   bind       - a zonelist built by bind_zonelist()
+ */
+static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+{
+	struct mempolicy *pol;
+
+	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+	if (mode == MPOL_DEFAULT)
+		return NULL;
+
+	pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (!pol)
+		return ERR_PTR(-ENOMEM);
+	atomic_set(&pol->refcnt, 1);
+
+	if (mode == MPOL_INTERLEAVE) {
+		bitmap_copy(pol->v.nodes, nodes, MAX_NUMNODES);
+	} else if (mode == MPOL_PREFERRED) {
+		pol->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+		if (pol->v.preferred_node >= MAX_NUMNODES)
+			pol->v.preferred_node = -1;
+	} else if (mode == MPOL_BIND) {
+		pol->v.zonelist = bind_zonelist(nodes);
+		if (pol->v.zonelist == NULL) {
+			kmem_cache_free(policy_cache, pol);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	pol->policy = mode;
+	return pol;
+}
+
+/*
+ * Ensure all existing pages follow the policy.
+ *
+ * Walks the page tables of @mm over [addr, end) and returns -EIO as
+ * soon as a present page lives on a node that is not set in @nodes.
+ * Address ranges without page tables are skipped.  Returns 0 when no
+ * offending page was found.
+ */
+static int
+verify_pages(struct mm_struct *mm,
+	     unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+	while (addr < end) {
+		struct page *p;
+		pte_t *pte;
+		pmd_t *pmd;
+		pud_t *pud;
+		pgd_t *pgd;
+		pgd = pgd_offset(mm, addr);
+		if (pgd_none(*pgd)) {
+			unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
+			/*
+			 * Skip the hole.  Stop only when the address
+			 * wrapped around; next > addr is the normal case
+			 * and must continue with the remaining range.
+			 */
+			if (next < addr)
+				break;
+			addr = next;
+			continue;
+		}
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			addr = (addr + PUD_SIZE) & PUD_MASK;
+			continue;
+		}
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd)) {
+			addr = (addr + PMD_SIZE) & PMD_MASK;
+			continue;
+		}
+		p = NULL;
+		pte = pte_offset_map(pmd, addr);
+		if (pte_present(*pte))
+			p = pte_page(*pte);
+		pte_unmap(pte);
+		if (p) {
+			unsigned nid = page_to_nid(p);
+			if (!test_bit(nid, nodes))
+				return -EIO;
+		}
+		addr += PAGE_SIZE;
+	}
+	return 0;
+}
+
+/*
+ * Step 1: check the range.
+ *
+ * Returns the first vma found for @start, or an ERR_PTR:
+ *   -EFAULT  no vma is found for @start, the vmas do not reach @end,
+ *            or there is a hole between two consecutive vmas
+ *   -EIO     (MPOL_MF_STRICT only) an existing page inside
+ *            [start, end) is on a node not set in @nodes
+ * Hugetlb vmas are not page-verified.
+ */
+static struct vm_area_struct *
+check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+	    unsigned long *nodes, unsigned long flags)
+{
+	int err;
+	struct vm_area_struct *first, *vma, *prev;
+
+	first = find_vma(mm, start);
+	if (!first)
+		return ERR_PTR(-EFAULT);
+	prev = NULL;
+	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+		if (!vma->vm_next && vma->vm_end < end)
+			return ERR_PTR(-EFAULT);
+		if (prev && prev->vm_end < vma->vm_start)
+			return ERR_PTR(-EFAULT);
+		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+			/*
+			 * Only verify the part of this vma that lies in
+			 * the requested range; pages outside of it are
+			 * not affected by the new policy.
+			 */
+			unsigned long vstart = vma->vm_start;
+			unsigned long vend = vma->vm_end;
+
+			if (vstart < start)
+				vstart = start;
+			if (vend > end)
+				vend = end;
+			err = verify_pages(vma->vm_mm, vstart, vend, nodes);
+			if (err) {
+				first = ERR_PTR(err);
+				break;
+			}
+		}
+		prev = vma;
+	}
+	return first;
+}
+
+/*
+ * Apply policy to a single VMA.
+ *
+ * When the vma has a ->set_policy method it is called first and its
+ * error, if any, is returned with the vma left untouched.  Otherwise
+ * a reference on @new is taken, it is installed as the vma's policy
+ * and the reference on the previous policy is dropped.
+ */
+static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct mempolicy *prev_pol = vma->vm_policy;
+	int ret;
+
+	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
+		 vma->vm_ops, vma->vm_file,
+		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy) {
+		ret = vma->vm_ops->set_policy(vma, new);
+		if (ret)
+			return ret;
+	}
+
+	mpol_get(new);
+	vma->vm_policy = new;
+	mpol_free(prev_pol);
+	return 0;
+}
+
+/*
+ * Step 2: apply policy to a range and do splits.
+ *
+ * Starting at @vma, every vma that begins below @end is split at
+ * @start and/or @end when it extends beyond them, and then gets @new
+ * via policy_vma().  Stops at and returns the first error from
+ * split_vma() or policy_vma(); returns 0 otherwise.
+ */
+static int mbind_range(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end, struct mempolicy *new)
+{
+	struct vm_area_struct *next;
+	int err;
+
+	err = 0;
+	for (; vma && vma->vm_start < end; vma = next) {
+		/* remember the successor before the vma is possibly split */
+		next = vma->vm_next;
+		if (vma->vm_start < start)
+			err = split_vma(vma->vm_mm, vma, start, 1);
+		if (!err && vma->vm_end > end)
+			err = split_vma(vma->vm_mm, vma, end, 0);
+		if (!err)
+			err = policy_vma(vma, new);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Change policy for a memory range.
+ *
+ * @start must be page aligned; @len is rounded up to a page multiple.
+ * Only MPOL_MF_STRICT is accepted in @flags, and it is ignored for
+ * MPOL_DEFAULT.  A zero-length range succeeds without doing anything.
+ * Returns 0 or a negative errno from the argument checks, get_nodes(),
+ * mpol_new(), check_range() or mbind_range().
+ */
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			  unsigned long mode,
+			  unsigned long __user *nmask, unsigned long maxnode,
+			  unsigned flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *new;
+	unsigned long end;
+	DECLARE_BITMAP(nodes, MAX_NUMNODES);
+	int err;
+
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+		return -EINVAL;
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	end = start + len;
+	/* the range wrapped around the end of the address space */
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+
+	err = get_nodes(nodes, nmask, maxnode, mode);
+	if (err)
+		return err;
+
+	new = mpol_new(mode, nodes);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+			mode,nodes[0]);
+
+	down_write(&mm->mmap_sem);
+	vma = check_range(mm, start, end, nodes, flags);
+	err = PTR_ERR(vma);
+	if (!IS_ERR(vma))
+		err = mbind_range(vma, start, end, new);
+	up_write(&mm->mmap_sem);
+	/* drop the reference obtained from mpol_new() */
+	mpol_free(new);
+	return err;
+}
+
+/*
+ * Set the process memory policy.
+ *
+ * Replaces current->mempolicy with a policy built from @mode and the
+ * user node mask; for an interleave policy the interleave cursor
+ * il_next is reset to the first node of the mask.  Returns 0, -EINVAL
+ * for an out-of-range mode, or the error from get_nodes()/mpol_new().
+ */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+				   unsigned long maxnode)
+{
+	int err;
+	struct mempolicy *new;
+	DECLARE_BITMAP(nodes, MAX_NUMNODES);
+
+	/*
+	 * mode is signed, so negative values must be rejected too;
+	 * otherwise a policy with an unknown mode would be installed.
+	 */
+	if (mode < 0 || mode > MPOL_MAX)
+		return -EINVAL;
+	err = get_nodes(nodes, nmask, maxnode, mode);
+	if (err)
+		return err;
+	new = mpol_new(mode, nodes);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+	mpol_free(current->mempolicy);
+	current->mempolicy = new;
+	if (new && new->policy == MPOL_INTERLEAVE)
+		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+	return 0;
+}
+
+/*
+ * Fill a zone bitmap for a policy.
+ *
+ * @nodes is cleared and then receives the set of nodes the policy
+ * refers to:
+ *   bind       - the node of every zone in the policy's zonelist
+ *   default    - nothing
+ *   interleave - the policy's node bitmap
+ *   preferred  - the preferred node, or all online nodes when the
+ *                preferred node is negative
+ * Any other policy value is a BUG().
+ */
+static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+{
+	int i;
+
+	bitmap_zero(nodes, MAX_NUMNODES);
+	switch (p->policy) {
+	case MPOL_BIND:
+		for (i = 0; p->v.zonelist->zones[i]; i++)
+			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+		break;
+	case MPOL_DEFAULT:
+		break;
+	case MPOL_INTERLEAVE:
+		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+		break;
+	case MPOL_PREFERRED:
+		/* or use current node instead of online map? */
+		if (p->v.preferred_node < 0)
+			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+		else
+			__set_bit(p->v.preferred_node, nodes);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return the node id of the page backing @addr in @mm, or the negative
+ * value returned by get_user_pages() when the page cannot be obtained.
+ * The page reference taken for the lookup is dropped again.
+ */
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
+{
+	struct page *pg;
+	int ret;
+
+	ret = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &pg, NULL);
+	if (ret < 0)
+		return ret;
+
+	ret = page_to_nid(pg);
+	put_page(pg);
+	return ret;
+}
+
+/*
+ * Copy a kernel node mask to user space.
+ *
+ * The user buffer is taken to hold @maxnode-1 bits rounded up to a
+ * multiple of 64.  When that is more than the @nbytes available in
+ * @nodes, the excess part of the user buffer is cleared (it must not
+ * exceed PAGE_SIZE bytes in total, else -EINVAL).  Returns 0 or
+ * -EFAULT otherwise.
+ */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      void *nodes, unsigned nbytes)
+{
+	unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
+
+	if (user_bytes > nbytes) {
+		char __user *tail = (char __user *)mask + nbytes;
+
+		if (user_bytes > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user(tail, user_bytes - nbytes))
+			return -EFAULT;
+		user_bytes = nbytes;
+	}
+
+	if (copy_to_user(mask, nodes, user_bytes))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Retrieve NUMA policy.
+ *
+ * @policy:  user pointer receiving the policy mode or a node id, may
+ *           be NULL
+ * @nmask:   user pointer receiving the policy's node mask, may be NULL
+ * @maxnode: size of @nmask in bits plus one; must be at least
+ *           MAX_NUMNODES when @nmask is given
+ * @addr:    address to look up (MPOL_F_ADDR); must be 0 otherwise
+ * @flags:   MPOL_F_ADDR - report the policy of the vma containing @addr
+ *                         instead of the process policy
+ *           MPOL_F_NODE - store a node id in @policy instead of the
+ *                         mode: with MPOL_F_ADDR the node of the page
+ *                         at @addr, otherwise the next interleave node
+ *                         of an interleaved process policy
+ *
+ * Returns 0, -EINVAL for bad arguments, -EFAULT for an unmapped @addr
+ * or a failed user copy, or the error from lookup_node().
+ */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				  unsigned long __user *nmask,
+				  unsigned long maxnode,
+				  unsigned long addr, unsigned long flags)
+{
+	int err, pval;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = NULL;
+	struct mempolicy *pol = current->mempolicy;
+
+	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
+		return -EINVAL;
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+	if (flags & MPOL_F_ADDR) {
+		/* mmap_sem stays held for reading while vma is non-NULL */
+		down_read(&mm->mmap_sem);
+		vma = find_vma_intersection(mm, addr, addr+1);
+		if (!vma) {
+			up_read(&mm->mmap_sem);
+			return -EFAULT;
+		}
+		if (vma->vm_ops && vma->vm_ops->get_policy)
+			pol = vma->vm_ops->get_policy(vma, addr);
+		else
+			pol = vma->vm_policy;
+	} else if (addr)
+		return -EINVAL;
+
+	if (!pol)
+		pol = &default_policy;
+
+	if (flags & MPOL_F_NODE) {
+		if (flags & MPOL_F_ADDR) {
+			err = lookup_node(mm, addr);
+			if (err < 0)
+				goto out;
+			pval = err;
+		} else if (pol == current->mempolicy &&
+				pol->policy == MPOL_INTERLEAVE) {
+			pval = current->il_next;
+		} else {
+			err = -EINVAL;
+			goto out;
+		}
+	} else
+		pval = pol->policy;
+
+	/* drop mmap_sem before touching user memory */
+	if (vma) {
+		up_read(&current->mm->mmap_sem);
+		vma = NULL;
+	}
+
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
+	err = 0;
+	if (nmask) {
+		DECLARE_BITMAP(nodes, MAX_NUMNODES);
+		/*
+		 * NOTE(review): pol may have been taken from the vma, and
+		 * mmap_sem is no longer held here - confirm that the
+		 * policy cannot be freed underneath get_zonemask().
+		 */
+		get_zonemask(pol, nodes);
+		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
+	}
+
+ out:
+	/* error exits taken while mmap_sem was still held end up here */
+	if (vma)
+		up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+
+/*
+ * Compat wrapper for sys_get_mempolicy().
+ *
+ * The native call fills a bitmap of unsigned longs in a scratch area
+ * obtained from compat_alloc_user_space(); on success that bitmap is
+ * read back and stored into the caller's @nmask in compat layout,
+ * after the caller's whole mask has been zeroed.  Returns the result
+ * of sys_get_mempolicy(), or -EFAULT when transferring the mask fails.
+ */
+asmlinkage long compat_sys_get_mempolicy(int __user *policy,
+				     compat_ulong_t __user *nmask,
+				     compat_ulong_t maxnode,
+				     compat_ulong_t addr, compat_ulong_t flags)
+{
+	long err;
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask)
+		nm = compat_alloc_user_space(alloc_size);
+
+	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
+
+	if (!err && nmask) {
+		/*
+		 * The user-copy helpers report failure as a non-zero
+		 * count or errno; turn any failure into -EFAULT rather
+		 * than returning a mix of those values, and stop at the
+		 * first failure.
+		 */
+		if (copy_from_user(bm, nm, alloc_size))
+			return -EFAULT;
+		/* ensure entire bitmap is zeroed */
+		if (clear_user(nmask, ALIGN(maxnode-1, 8) / 8))
+			return -EFAULT;
+		if (compat_put_bitmap(nmask, bm, nr_bits))
+			return -EFAULT;
+	}
+
+	return err;
+}
+
+/*
+ * Compat wrapper for sys_set_mempolicy().
+ *
+ * Converts the caller's compat node mask into a bitmap of unsigned
+ * longs, places it in a scratch area obtained from
+ * compat_alloc_user_space() and hands that to the native call.
+ * Returns -EFAULT when the mask cannot be transferred, otherwise the
+ * result of sys_set_mempolicy().
+ */
+asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
+				     compat_ulong_t maxnode)
+{
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		/*
+		 * Bail out as soon as reading the mask fails: bm is an
+		 * uninitialised stack buffer and must not be copied out
+		 * to user memory in that case.
+		 */
+		if (compat_get_bitmap(bm, nmask, nr_bits))
+			return -EFAULT;
+		nm = compat_alloc_user_space(alloc_size);
+		if (copy_to_user(nm, bm, alloc_size))
+			return -EFAULT;
+	}
+
+	return sys_set_mempolicy(mode, nm, nr_bits+1);
+}
+
+/*
+ * Compat wrapper for sys_mbind().
+ *
+ * Converts the caller's compat node mask into a bitmap of unsigned
+ * longs, places it in a scratch area obtained from
+ * compat_alloc_user_space() and hands that to the native call.
+ * Returns -EFAULT when the mask cannot be transferred, otherwise the
+ * result of sys_mbind().
+ */
+asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
+			     compat_ulong_t mode, compat_ulong_t __user *nmask,
+			     compat_ulong_t maxnode, compat_ulong_t flags)
+{
+	unsigned long __user *nm = NULL;
+	unsigned long nr_bits, alloc_size;
+	DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+	if (nmask) {
+		/*
+		 * Bail out as soon as reading the mask fails: bm is an
+		 * uninitialised stack buffer and must not be copied out
+		 * to user memory in that case.
+		 */
+		if (compat_get_bitmap(bm, nmask, nr_bits))
+			return -EFAULT;
+		nm = compat_alloc_user_space(alloc_size);
+		if (copy_to_user(nm, bm, alloc_size))
+			return -EFAULT;
+	}
+
+	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
+}
+
+#endif
+
+/*
+ * Return effective policy for a VMA.
+ *
+ * Order of preference: the vma's ->get_policy method when it has one,
+ * else the vma's own policy unless that is missing or MPOL_DEFAULT,
+ * else the current task's policy.  A NULL result of any of these is
+ * replaced by the default policy, so the return value is never NULL.
+ */
+static struct mempolicy *
+get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *chosen = current->mempolicy;
+
+	if (vma && vma->vm_ops && vma->vm_ops->get_policy)
+		chosen = vma->vm_ops->get_policy(vma, addr);
+	else if (vma && vma->vm_policy &&
+			vma->vm_policy->policy != MPOL_DEFAULT)
+		chosen = vma->vm_policy;
+
+	return chosen ? chosen : &default_policy;
+}
+
+/*
+ * Return a zonelist representing a mempolicy.
+ *
+ * preferred - the zonelist of the preferred node, or of the local
+ *             node when the preferred node is negative
+ * bind      - the policy's own zonelist when the gfp check below
+ *             passes and the cpuset accepts it, else the local node
+ * default   - the zonelist of the local node
+ * For the node based cases the zonelist is selected by the zone bits
+ * of @gfp.  Any other policy value is a BUG().
+ */
+static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
+{
+	int nd;
+
+	switch (policy->policy) {
+	case MPOL_PREFERRED:
+		nd = policy->v.preferred_node;
+		if (nd < 0)
+			nd = numa_node_id();
+		break;
+	case MPOL_BIND:
+		/* Lower zones don't get a policy applied */
+		/* Careful: current->mems_allowed might have moved */
+		/*
+		 * NOTE(review): this compares the complete gfp value, not
+		 * only its zone bits, with policy_zone - confirm that this
+		 * is the intended test.
+		 */
+		if (gfp >= policy_zone)
+			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
+				return policy->v.zonelist;
+		/*FALL THROUGH*/
+	case MPOL_INTERLEAVE: /* should not happen */
+	case MPOL_DEFAULT:
+		nd = numa_node_id();
+		break;
+	default:
+		nd = 0;
+		BUG();
+	}
+	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
+}
+
+/*
+ * Do dynamic interleaving for a process.
+ *
+ * Returns the node stored in current->il_next and advances il_next to
+ * the next node set in the policy's bitmap, wrapping around to the
+ * first one after the last.
+ */
+static unsigned interleave_nodes(struct mempolicy *policy)
+{
+	struct task_struct *tsk = current;
+	unsigned cur = tsk->il_next;
+	unsigned following;
+
+	BUG_ON(cur >= MAX_NUMNODES);
+
+	following = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+cur);
+	if (following >= MAX_NUMNODES)
+		following = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+
+	tsk->il_next = following;
+	return cur;
+}
+
+/*
+ * Do static interleaving for a VMA with known offset.
+ *
+ * Picks the (@off modulo number-of-nodes)'th node set in the policy's
+ * bitmap, counting from zero, so a given offset always yields the same
+ * node.  @vma is not used.
+ */
+static unsigned offset_il_node(struct mempolicy *pol,
+		struct vm_area_struct *vma, unsigned long off)
+{
+	unsigned weight = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+	unsigned target = (unsigned)off % weight;
+	int step;
+	int nid = -1;
+
+	/* advance target+1 times: the first step lands on the first set bit */
+	for (step = 0; step <= target; step++)
+		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+
+	BUG_ON(nid >= MAX_NUMNODES);
+	BUG_ON(!test_bit(nid, pol->v.nodes));
+	return nid;
+}
+
+/* Allocate a page in interleaved policy.
+   Own path because it needs to do special accounting.
+
+   Allocates 2^order pages using the zonelist of node @nid that matches
+   the zone bits of @gfp.  @nid must be online.  When the page came
+   from the first zone of that zonelist, the interleave_hit counter of
+   that zone's pageset for the current CPU is incremented.  Returns the
+   page, or NULL when __alloc_pages() fails. */
+static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
+{
+	struct zonelist *zl;
+	struct page *page;
+
+	BUG_ON(!node_online(nid));
+	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
+	page = __alloc_pages(gfp, order, zl);
+	if (page && page_zone(page) == zl->zones[0]) {
+		/* get_cpu()/put_cpu() bracket the per-cpu counter update */
+		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+		put_cpu();
+	}
+	return page;
+}
+
+/**
+ * 	alloc_page_vma	- Allocate a page for a VMA.
+ *
+ * 	@gfp:
+ *      %GFP_USER    user allocation.
+ *      %GFP_KERNEL  kernel allocations,
+ *      %GFP_HIGHMEM highmem/user allocations,
+ *      %GFP_FS      allocation should not call back into a file system.
+ *      %GFP_ATOMIC  don't sleep.
+ *
+ * 	@vma:  Pointer to VMA or NULL if not available.
+ *	@addr: Virtual Address of the allocation. Must be inside the VMA.
+ *
+ * 	This function allocates a page from the kernel page pool and applies
+ *	a NUMA policy associated with the VMA or the current process.
+ *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ *	mm_struct of the VMA to prevent it from going away. Should be used for
+ *	all allocations for pages that will be mapped into
+ * 	user space. Returns NULL when no page can be allocated.
+ *
+ *	Should be called with the mm_sem of the vma held.
+ *
+ *	For an interleave policy the node is derived from the page offset of
+ *	@addr (vm_pgoff plus the page index within the VMA); without a VMA
+ *	the per-process interleave counter is used instead.
+ */
+struct page *
+alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(vma, addr);
+
+	cpuset_update_current_mems_allowed();
+
+	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
+		unsigned nid;
+		if (vma) {
+			unsigned long off;
+			BUG_ON(addr >= vma->vm_end);
+			BUG_ON(addr < vma->vm_start);
+			off = vma->vm_pgoff;
+			off += (addr - vma->vm_start) >> PAGE_SHIFT;
+			nid = offset_il_node(pol, vma, off);
+		} else {
+			/* fall back to process interleaving */
+			nid = interleave_nodes(pol);
+		}
+		return alloc_page_interleave(gfp, 0, nid);
+	}
+	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+}
+
+/**
+ * 	alloc_pages_current - Allocate pages.
+ *
+ *	@gfp:
+ *		%GFP_USER   user allocation,
+ *      	%GFP_KERNEL kernel allocation,
+ *      	%GFP_HIGHMEM highmem allocation,
+ *      	%GFP_FS     don't call back into a file system.
+ *      	%GFP_ATOMIC don't sleep.
+ *	@order: Power of two of allocation size in pages. 0 is a single page.
+ *
+ *	Allocate pages from the kernel page pool.  When not in interrupt
+ *	context, the NUMA policy of the current process is applied; in
+ *	interrupt context, or when the process has no policy, the default
+ *	policy is used.  Returns NULL when no page can be allocated.
+ *
+ *	Don't call cpuset_update_current_mems_allowed() unless
+ *	1) it's ok to take cpuset_sem (can WAIT), and
+ *	2) allocating for current task (not interrupt).
+ */
+struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
+{
+	struct mempolicy *pol = current->mempolicy;
+
+	if (!in_interrupt() && (gfp & __GFP_WAIT))
+		cpuset_update_current_mems_allowed();
+
+	if (in_interrupt() || !pol)
+		pol = &default_policy;
+
+	if (pol->policy != MPOL_INTERLEAVE)
+		return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+
+	return alloc_page_interleave(gfp, order, interleave_nodes(pol));
+}
+EXPORT_SYMBOL(alloc_pages_current);
+
+/*
+ * Slow path of a mempolicy copy.
+ *
+ * Returns a freshly allocated duplicate of @old with a reference count
+ * of one, or ERR_PTR(-ENOMEM) when an allocation fails.
+ */
+struct mempolicy *__mpol_copy(struct mempolicy *old)
+{
+	struct mempolicy *copy;
+	int len;
+
+	copy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
+
+	*copy = *old;
+	atomic_set(&copy->refcnt, 1);
+	if (copy->policy != MPOL_BIND)
+		return copy;
+
+	/* A bind policy points at its own zonelist; duplicate that as well. */
+	len = ksize(old->v.zonelist);
+	copy->v.zonelist = kmalloc(len, SLAB_KERNEL);
+	if (!copy->v.zonelist) {
+		kmem_cache_free(policy_cache, copy);
+		return ERR_PTR(-ENOMEM);
+	}
+	memcpy(copy->v.zonelist, old->v.zonelist, len);
+	return copy;
+}
+
+/*
+ * Slow path of a mempolicy comparison.
+ *
+ * Returns non-zero when @a and @b describe the same policy, 0 otherwise.
+ * A NULL policy never compares equal to anything here.
+ */
+int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+{
+	struct zone **za, **zb;
+
+	if (!a || !b || a->policy != b->policy)
+		return 0;
+
+	switch (a->policy) {
+	case MPOL_DEFAULT:
+		return 1;
+	case MPOL_PREFERRED:
+		return a->v.preferred_node == b->v.preferred_node;
+	case MPOL_INTERLEAVE:
+		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+	case MPOL_BIND:
+		/* Walk both NULL-terminated zone arrays in lock step. */
+		za = a->v.zonelist->zones;
+		zb = b->v.zonelist->zones;
+		for (; *za; za++, zb++)
+			if (*za != *zb)
+				return 0;
+		/* Equal only if b's list ends at the same place. */
+		return *zb == NULL;
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+/*
+ * Slow path of a mpol destructor: drop one reference and release the
+ * policy (and a bind policy's zonelist) once the count reaches zero.
+ */
+void __mpol_free(struct mempolicy *p)
+{
+	if (atomic_dec_and_test(&p->refcnt)) {
+		if (p->policy == MPOL_BIND)
+			kfree(p->v.zonelist);
+		p->policy = MPOL_DEFAULT;
+		kmem_cache_free(policy_cache, p);
+	}
+}
+
+/*
+ * Hugetlb policy. Same as above, just works with node numbers instead of
+ * zonelists.
+ */
+
+/*
+ * Find first node suitable for an allocation, according to the policy
+ * that get_vma_policy() returns for @vma and @addr.
+ */
+int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(vma, addr);
+
+	switch (pol->policy) {
+	case MPOL_BIND:
+		/* Node of the first zone in the bound zonelist. */
+		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(pol);
+	case MPOL_PREFERRED:
+		if (pol->v.preferred_node >= 0)
+			return pol->v.preferred_node;
+		/* Negative preferred node: use the local node. */
+		return numa_node_id();
+	case MPOL_DEFAULT:
+		return numa_node_id();
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+/*
+ * Find secondary valid nodes for an allocation: returns 1 when node
+ * @nid is acceptable under the policy for @vma/@addr, 0 otherwise.
+ */
+int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(vma, addr);
+	struct zone **zp;
+
+	switch (pol->policy) {
+	case MPOL_DEFAULT:
+	case MPOL_PREFERRED:
+	case MPOL_INTERLEAVE:
+		/* These policies accept every node here. */
+		return 1;
+	case MPOL_BIND:
+		break;
+	default:
+		BUG();
+		return 0;
+	}
+
+	/* Bind: @nid must own at least one zone of the zonelist. */
+	for (zp = pol->v.zonelist->zones; *zp; zp++)
+		if ((*zp)->zone_pgdat->node_id == nid)
+			return 1;
+	return 0;
+}
+
+/*
+ * Shared memory backing store policy support.
+ *
+ * Remember policies even when nobody has shared memory mapped.
+ * The policies are kept in a red-black tree linked from the inode.
+ * They are protected by the sp->lock spinlock, which should be held
+ * for any access to the tree.
+ */
+
+/*
+ * Look up the first (lowest) element intersecting [start, end).
+ * Returns NULL when nothing in the tree overlaps the range.
+ * Caller holds sp->lock.
+ */
+static struct sp_node *
+sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+{
+	struct rb_node *hit = sp->root.rb_node;
+	struct rb_node *prev;
+
+	/* Descend until some node overlapping the range is found. */
+	while (hit) {
+		struct sp_node *cur = rb_entry(hit, struct sp_node, nd);
+
+		if (start >= cur->end)
+			hit = hit->rb_right;
+		else if (end <= cur->start)
+			hit = hit->rb_left;
+		else
+			break;
+	}
+	if (!hit)
+		return NULL;
+
+	/* Step back through predecessors that still reach past @start. */
+	while ((prev = rb_prev(hit)) != NULL) {
+		if (rb_entry(prev, struct sp_node, nd)->end <= start)
+			break;
+		hit = prev;
+	}
+	return rb_entry(hit, struct sp_node, nd);
+}
+
+/*
+ * Insert a new shared policy node into the tree.
+ * Hits BUG() if @new neither starts before nor ends after a node met
+ * on the way down.
+ * Caller holds sp->lock.
+ */
+static void sp_insert(struct shared_policy *sp, struct sp_node *new)
+{
+	struct rb_node **link = &sp->root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct sp_node *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct sp_node, nd);
+		if (new->start < cur->start)
+			link = &parent->rb_left;
+		else if (new->end > cur->end)
+			link = &parent->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->nd, parent, link);
+	rb_insert_color(&new->nd, &sp->root);
+	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+		 new->policy ? new->policy->policy : 0);
+}
+
+/*
+ * Find shared policy intersecting idx.
+ *
+ * Returns the policy with an extra reference taken via mpol_get(), or
+ * NULL when the tree is empty or no node covers @idx.
+ */
+struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	struct mempolicy *found = NULL;
+	struct sp_node *node;
+
+	/* Empty tree: skip taking the lock. */
+	if (!sp->root.rb_node)
+		return NULL;
+
+	spin_lock(&sp->lock);
+	node = sp_lookup(sp, idx, idx + 1);
+	if (node) {
+		found = node->policy;
+		mpol_get(found);
+	}
+	spin_unlock(&sp->lock);
+
+	return found;
+}
+
+/*
+ * Unlink node @n from the tree of @sp, drop its policy reference and
+ * free the node.  Callers in this file invoke it with sp->lock held.
+ */
+static void sp_delete(struct shared_policy *sp, struct sp_node *n)
+{
+	/* Both bounds are unsigned long, so both need %lx. */
+	PDprintk("deleting %lx-%lx\n", n->start, n->end);
+	rb_erase(&n->nd, &sp->root);
+	mpol_free(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
+/*
+ * Allocate a tree node for the range @start..@end that refers to @pol.
+ * A reference on @pol is taken with mpol_get().  Returns NULL when the
+ * node cannot be allocated.
+ */
+struct sp_node *
+sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
+{
+	struct sp_node *node;
+
+	node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	if (node) {
+		node->start = start;
+		node->end = end;
+		node->policy = pol;
+		mpol_get(pol);
+	}
+	return node;
+}
+
+/*
+ * Replace a policy range.
+ *
+ * Removes or trims every existing node overlapping [start, end) and
+ * then, if @new is not NULL, inserts @new.  When an old node spans the
+ * whole new range it has to be split, which needs a second node; that
+ * node is allocated with the lock dropped and the walk is restarted.
+ *
+ * Returns 0 on success, or -ENOMEM if the split node cannot be
+ * allocated; in that case the tree is unchanged and @new has not been
+ * inserted.
+ */
+static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
+				 unsigned long end, struct sp_node *new)
+{
+	struct sp_node *n, *new2 = NULL;
+
+restart:
+	spin_lock(&sp->lock);
+	n = sp_lookup(sp, start, end);
+	/* Take care of old policies in the same range. */
+	while (n && n->start < end) {
+		struct rb_node *next = rb_next(&n->nd);
+		if (n->start >= start) {
+			if (n->end <= end)
+				sp_delete(sp, n);
+			else
+				n->start = end;
+		} else {
+			/* Old policy spanning whole new range. */
+			if (n->end > end) {
+				if (!new2) {
+					/*
+					 * Snapshot what we need and pin the
+					 * policy while still holding the lock:
+					 * once it is dropped, @n may be
+					 * trimmed or freed by someone else.
+					 */
+					unsigned long old_end = n->end;
+					struct mempolicy *old_pol = n->policy;
+
+					mpol_get(old_pol);
+					spin_unlock(&sp->lock);
+					new2 = sp_alloc(end, old_end, old_pol);
+					mpol_free(old_pol);
+					if (!new2)
+						return -ENOMEM;
+					goto restart;
+				}
+				/*
+				 * The tree may have changed while the lock was
+				 * dropped; make the tail node describe the
+				 * node we are splitting right now.
+				 */
+				new2->end = n->end;
+				if (new2->policy != n->policy) {
+					mpol_get(n->policy);
+					mpol_free(new2->policy);
+					new2->policy = n->policy;
+				}
+				n->end = start;
+				sp_insert(sp, new2);
+				new2 = NULL;
+				break;
+			} else
+				n->end = start;
+		}
+		if (!next)
+			break;
+		n = rb_entry(next, struct sp_node, nd);
+	}
+	if (new)
+		sp_insert(sp, new);
+	spin_unlock(&sp->lock);
+	/* A pre-allocated split node that turned out to be unnecessary. */
+	if (new2) {
+		mpol_free(new2->policy);
+		kmem_cache_free(sn_cache, new2);
+	}
+	return 0;
+}
+
+/*
+ * Set (or, with @npol == NULL, clear) the shared policy for the page
+ * range that @vma maps: vm_pgoff .. vm_pgoff + vma_pages(vma).
+ *
+ * Returns 0 on success or -ENOMEM when a tree node cannot be allocated.
+ */
+int mpol_set_shared_policy(struct shared_policy *info,
+			struct vm_area_struct *vma, struct mempolicy *npol)
+{
+	int err;
+	struct sp_node *new = NULL;
+	unsigned long sz = vma_pages(vma);
+
+	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+		 vma->vm_pgoff,
+		 sz, npol? npol->policy : -1,
+		npol ? npol->v.nodes[0] : -1);
+
+	if (npol) {
+		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+		if (!new)
+			return -ENOMEM;
+	}
+	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+	if (err && new) {
+		/*
+		 * The node was never inserted.  sp_alloc() took a reference
+		 * on the policy, so drop it before freeing the node.
+		 */
+		mpol_free(new->policy);
+		kmem_cache_free(sn_cache, new);
+	}
+	return err;
+}
+
+/*
+ * Free a backing policy store on inode delete.
+ *
+ * Every node is unlinked from the tree before it is freed, so that the
+ * rb_next() calls of later iterations never follow parent/child links
+ * into memory that has already been released.
+ */
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+	struct sp_node *n;
+	struct rb_node *next;
+
+	if (!p->root.rb_node)
+		return;
+	spin_lock(&p->lock);
+	next = rb_first(&p->root);
+	while (next) {
+		n = rb_entry(next, struct sp_node, nd);
+		/* Fetch the successor first; @n is about to go away. */
+		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
+		mpol_free(n->policy);
+		kmem_cache_free(sn_cache, n);
+	}
+	spin_unlock(&p->lock);
+	p->root = RB_ROOT;
+}
+
+/*
+ * Create the slab caches used by this file and switch the calling
+ * task to interleaving over the online nodes.
+ * assumes fs == KERNEL_DS
+ */
+void __init numa_policy_init(void)
+{
+	long err;
+
+	policy_cache = kmem_cache_create("numa_policy",
+					 sizeof(struct mempolicy),
+					 0, SLAB_PANIC, NULL, NULL);
+
+	sn_cache = kmem_cache_create("shared_policy_node",
+				     sizeof(struct sp_node),
+				     0, SLAB_PANIC, NULL, NULL);
+
+	/*
+	 * Set interleaving policy for system init. This way not all
+	 * the data structures allocated at system boot end up in node zero.
+	 */
+	err = sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
+				MAX_NUMNODES);
+	if (err < 0)
+		printk("numa_policy_init: interleaving failed\n");
+}
+
+/* Reset policy of current process to default.
+ * Assumes fs == KERNEL_DS */
+void numa_default_policy(void)
+{
+	/* No node mask is passed for MPOL_DEFAULT; the result is ignored. */
+	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+}
diff --git a/mm/mempool.c b/mm/mempool.c
new file mode 100644
index 000000000000..b014ffeaa413
--- /dev/null
+++ b/mm/mempool.c
@@ -0,0 +1,290 @@
+/*
+ *  linux/mm/mempool.c
+ *
+ *  memory buffer pool support. Such pools are mostly used
+ *  for guaranteed, deadlock-free memory allocations during
+ *  extreme VM load.
+ *
+ *  started by Ingo Molnar, Copyright (C) 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+
+/* Push @element onto the pool's reserve; the reserve must not be full. */
+static void add_element(mempool_t *pool, void *element)
+{
+	BUG_ON(pool->curr_nr >= pool->min_nr);
+	pool->elements[pool->curr_nr] = element;
+	pool->curr_nr++;
+}
+
+/* Pop the most recently added element; the reserve must not be empty. */
+static void *remove_element(mempool_t *pool)
+{
+	BUG_ON(pool->curr_nr <= 0);
+	pool->curr_nr--;
+	return pool->elements[pool->curr_nr];
+}
+
+/*
+ * Release every reserved element through the pool's free callback,
+ * then the element array and the pool descriptor itself.
+ */
+static void free_pool(mempool_t *pool)
+{
+	void *victim;
+
+	while (pool->curr_nr) {
+		victim = remove_element(pool);
+		pool->free(victim, pool->pool_data);
+	}
+
+	kfree(pool->elements);
+	kfree(pool);
+}
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Creates a pool and pre-allocates @min_nr elements for it with
+ * GFP_KERNEL, so this function might sleep. The pool can then be used
+ * with mempool_alloc() and mempool_free(). Both alloc_fn() and free_fn()
+ * might sleep - as long as the mempool_alloc function is not called
+ * from IRQ contexts.
+ *
+ * Returns the new pool, or NULL if any allocation fails (everything
+ * allocated so far is released again).
+ */
+mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+				mempool_free_t *free_fn, void *pool_data)
+{
+	mempool_t *mp;
+
+	mp = kmalloc(sizeof(*mp), GFP_KERNEL);
+	if (!mp)
+		return NULL;
+	memset(mp, 0, sizeof(*mp));
+
+	mp->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
+	if (!mp->elements) {
+		kfree(mp);
+		return NULL;
+	}
+
+	mp->min_nr = min_nr;
+	mp->alloc = alloc_fn;
+	mp->free = free_fn;
+	mp->pool_data = pool_data;
+	spin_lock_init(&mp->lock);
+	init_waitqueue_head(&mp->wait);
+
+	/*
+	 * Fill the reserve up to the guaranteed minimum.
+	 */
+	while (mp->curr_nr < mp->min_nr) {
+		void *obj = mp->alloc(GFP_KERNEL, mp->pool_data);
+
+		if (unlikely(!obj)) {
+			free_pool(mp);
+			return NULL;
+		}
+		add_element(mp, obj);
+	}
+	return mp;
+}
+EXPORT_SYMBOL(mempool_create);
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool:       pointer to the memory pool which was allocated via
+ *              mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ *              allocated for this pool.
+ * @gfp_mask:   the usual allocation bitmask.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Returns 0 on success - including the case where the element array
+ * was grown but could not be completely refilled - and -ENOMEM when
+ * the larger element array cannot be allocated.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
+{
+	void *element;
+	void **new_elements;
+	unsigned long flags;
+
+	BUG_ON(new_min_nr <= 0);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (new_min_nr <= pool->min_nr) {
+		/*
+		 * Shrinking (or same size): hand surplus elements back one
+		 * at a time, calling the free callback with the lock dropped.
+		 */
+		while (new_min_nr < pool->curr_nr) {
+			element = remove_element(pool);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);
+			spin_lock_irqsave(&pool->lock, flags);
+		}
+		pool->min_nr = new_min_nr;
+		goto out_unlock;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/* Grow the pool */
+	/* The new array is allocated without holding the pool lock. */
+	new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+	if (!new_elements)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	/* min_nr may have changed while the lock was released: re-check. */
+	if (unlikely(new_min_nr <= pool->min_nr)) {
+		/* Raced, other resize will do our work */
+		spin_unlock_irqrestore(&pool->lock, flags);
+		kfree(new_elements);
+		goto out;
+	}
+	/* Carry the currently reserved elements over into the new array. */
+	memcpy(new_elements, pool->elements,
+			pool->curr_nr * sizeof(*new_elements));
+	kfree(pool->elements);
+	pool->elements = new_elements;
+	pool->min_nr = new_min_nr;
+
+	/* Top the reserve up; every allocation happens with the lock dropped. */
+	while (pool->curr_nr < pool->min_nr) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (!element)
+			goto out;
+		spin_lock_irqsave(&pool->lock, flags);
+		/* Re-check under the lock before adding the new element. */
+		if (pool->curr_nr < pool->min_nr) {
+			add_element(pool, element);
+		} else {
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);	/* Raced */
+			goto out;
+		}
+	}
+out_unlock:
+	spin_unlock_irqrestore(&pool->lock, flags);
+out:
+	return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * This function only sleeps if the free_fn() function sleeps. The caller
+ * has to guarantee that all elements have been returned to the pool (ie:
+ * freed) prior to calling mempool_destroy(); a pool whose reserve is not
+ * full triggers BUG().
+ */
+void mempool_destroy(mempool_t *pool)
+{
+	/* There were outstanding elements */
+	BUG_ON(pool->curr_nr != pool->min_nr);
+	free_pool(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ * @gfp_mask:  the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts. (it might
+ * fail if called from an IRQ context.)
+ *
+ * Returns the element, or NULL only when @gfp_mask lacks __GFP_WAIT
+ * and neither the allocator nor the reserve could supply one.
+ */
+void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
+{
+	void *element;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	/* Same mask with __GFP_WAIT and __GFP_IO cleared, for the first attempt. */
+	int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+repeat_alloc:
+	/* First try the underlying allocator with the reduced mask. */
+	element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data);
+	if (likely(element != NULL))
+		return element;
+
+	/*
+	 * If the pool is less than 50% full and we can perform effective
+	 * page reclaim then try harder to allocate an element.
+	 */
+	mb();
+	if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
+				(pool->curr_nr <= pool->min_nr/2)) {
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (likely(element != NULL))
+			return element;
+	}
+
+	/*
+	 * Kick the VM at this point.
+	 */
+	wakeup_bdflush(0);
+
+	/* Fall back to the preallocated reserve, under the pool lock. */
+	spin_lock_irqsave(&pool->lock, flags);
+	if (likely(pool->curr_nr)) {
+		element = remove_element(pool);
+		spin_unlock_irqrestore(&pool->lock, flags);
+		return element;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/* We must not sleep in the GFP_ATOMIC case */
+	if (!(gfp_mask & __GFP_WAIT))
+		return NULL;
+
+	/*
+	 * Queue on the pool's wait queue, re-check the reserve and sleep
+	 * only if it is still empty; mempool_free() wakes this queue after
+	 * adding an element.
+	 */
+	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+	mb();
+	if (!pool->curr_nr)
+		io_schedule();
+	finish_wait(&pool->wait, &wait);
+
+	goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element:   pool element pointer.
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ *
+ * The element refills the pool's reserve when that is below min_nr;
+ * otherwise it is handed to the pool's free callback.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+	unsigned long flags;
+
+	mb();
+	/* Unlocked check first; repeated under the lock before adding. */
+	if (pool->curr_nr < pool->min_nr) {
+		spin_lock_irqsave(&pool->lock, flags);
+		if (pool->curr_nr < pool->min_nr) {
+			add_element(pool, element);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			/* Let a sleeper in mempool_alloc() retry. */
+			wake_up(&pool->wait);
+			return;
+		}
+		spin_unlock_irqrestore(&pool->lock, flags);
+	}
+	/* Reserve is full: really free the element. */
+	pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ */
+/* Allocation callback for pools whose @pool_data is a slab cache. */
+void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
+{
+	kmem_cache_t *cache = pool_data;
+
+	return kmem_cache_alloc(cache, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+/* Free callback for pools whose @pool_data is a slab cache. */
+void mempool_free_slab(void *element, void *pool_data)
+{
+	kmem_cache_t *cache = pool_data;
+
+	kmem_cache_free(cache, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/mincore.c b/mm/mincore.c
new file mode 100644
index 000000000000..07833dc5829d
--- /dev/null
+++ b/mm/mincore.c
@@ -0,0 +1,191 @@
+/*
+ *	linux/mm/mincore.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * The mincore() system call.
+ */
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now a page counts as resident when it is found in the page cache
+ * of the file backing @vma and is up to date; i.e. no page-in operation
+ * would be required at this time if an application were to map and
+ * access this page.
+ *
+ * Returns 0 when the page is not in the page cache, otherwise the
+ * PageUptodate() result truncated to unsigned char.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct page *pg = find_get_page(mapping, pgoff);
+	unsigned char uptodate;
+
+	if (!pg)
+		return 0;
+
+	uptodate = PageUptodate(pg);
+	/* Drop the reference find_get_page() handed us. */
+	page_cache_release(pg);
+	return uptodate;
+}
+
+/*
+ * Report residency for the part of [start, end) that lies inside @vma,
+ * writing one byte per page to the user buffer @vec.
+ *
+ * @end is clamped to the end of the VMA.  The status bytes are built
+ * in a temporary kernel page and copied out one page worth at a time.
+ *
+ * Returns 0 on success, -ENOMEM if the VMA has no backing file,
+ * -EAGAIN if the temporary page cannot be allocated and -EFAULT if
+ * copying to @vec fails.
+ */
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char __user * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	/* Convert the virtual addresses into page offsets of the mapping. */
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec" */
+	remaining = end - start;
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		/* At most one page worth of status bytes per round. */
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char __user * vec)
+{
+	/* Position in vec of the status byte for the page at 'start'. */
+	int index = 0;
+	unsigned long end, limit;
+	struct vm_area_struct * vma;
+	size_t max;
+	/* Set to -ENOMEM once a gap before a VMA has been skipped. */
+	int unmapped_error = 0;
+	long error;
+
+	/* check the arguments */
+ 	if (start & ~PAGE_CACHE_MASK)
+		goto einval;
+
+	if (start < FIRST_USER_PGD_NR * PGDIR_SIZE)
+		goto enomem;
+
+	limit = TASK_SIZE;
+	if (start >= limit)
+		goto enomem;
+
+	if (!len)
+		return 0;
+
+	/* Round len up; reject ranges past the limit or that wrapped to 0. */
+	max = limit - start;
+	len = PAGE_CACHE_ALIGN(len);
+	if (len > max || !len)
+		goto enomem;
+
+	end = start + len;
+
+	/* check the output buffer whilst holding the lock */
+	error = -EFAULT;
+	down_read(&current->mm->mmap_sem);
+
+	/* One status byte per page, hence len >> PAGE_SHIFT bytes. */
+	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	error = 0;
+
+	vma = find_vma(current->mm, start);
+	while (vma) {
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			/* Last VMA of the walk: finish and report any gap. */
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		/* Advance past this VMA in both the vector and the range. */
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+	/* we found a hole in the area queried if we arrive here */
+	error = -ENOMEM;
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+
+einval:
+	return -EINVAL;
+enomem:
+	return -ENOMEM;
+}
diff --git a/mm/mlock.c b/mm/mlock.c
new file mode 100644
index 000000000000..4ae3a46ff768
--- /dev/null
+++ b/mm/mlock.c
@@ -0,0 +1,253 @@
+/*
+ *	linux/mm/mlock.c
+ *
+ *  (C) Copyright 1995 Linus Torvalds
+ *  (C) Copyright 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+
+
+/*
+ * Apply @newflags to the part [start, end) of @vma and adjust the mm's
+ * locked_vm count accordingly.
+ *
+ * The range is first offered to vma_merge(); if that does not produce a
+ * VMA, @vma is split at @start and/or @end as needed.  On return *prev
+ * is the VMA that now covers the range (the merged one, or @vma).
+ *
+ * Returns 0 on success or a negative error from split_vma() or
+ * make_pages_present(); -ENOMEM is reported to the caller as -EAGAIN.
+ */
+static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+	unsigned long start, unsigned long end, unsigned int newflags)
+{
+	struct mm_struct * mm = vma->vm_mm;
+	pgoff_t pgoff;
+	int pages;
+	int ret = 0;
+
+	/* Nothing to change for this VMA. */
+	if (newflags == vma->vm_flags) {
+		*prev = vma;
+		goto out;
+	}
+
+	/* Page offset that corresponds to 'start' within the mapping. */
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+			  vma->vm_file, pgoff, vma_policy(vma));
+	if (*prev) {
+		vma = *prev;
+		goto success;
+	}
+
+	*prev = vma;
+
+	/* No merge happened: split so that the VMA covers exactly the range. */
+	if (start != vma->vm_start) {
+		ret = split_vma(mm, vma, start, 1);
+		if (ret)
+			goto out;
+	}
+
+	if (end != vma->vm_end) {
+		ret = split_vma(mm, vma, end, 0);
+		if (ret)
+			goto out;
+	}
+
+success:
+	/*
+	 * vm_flags is protected by the mmap_sem held in write mode.
+	 * It's okay if try_to_unmap_one unmaps a page just after we
+	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 */
+	vma->vm_flags = newflags;
+
+	/*
+	 * Keep track of amount of locked VM.
+	 */
+	pages = (end - start) >> PAGE_SHIFT;
+	if (newflags & VM_LOCKED) {
+		/* Negated so that the subtraction below adds when locking. */
+		pages = -pages;
+		if (!(newflags & VM_IO))
+			ret = make_pages_present(start, end);
+	}
+
+	vma->vm_mm->locked_vm -= pages;
+out:
+	if (ret == -ENOMEM)
+		ret = -EAGAIN;
+	return ret;
+}
+
+/*
+ * Set (@on != 0) or clear (@on == 0) VM_LOCKED on every VMA covering
+ * [start, start + len), with @len rounded up to a page multiple.
+ *
+ * Returns 0 on success (also for an empty range), -EINVAL if the range
+ * wraps around, -ENOMEM if any part of the range is not covered by a
+ * VMA, or the error returned by mlock_fixup().
+ */
+static int do_mlock(unsigned long start, size_t len, int on)
+{
+	unsigned long nstart, end, tmp;
+	struct vm_area_struct * vma, * prev;
+	int error;
+
+	len = PAGE_ALIGN(len);
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+	vma = find_vma_prev(current->mm, start, &prev);
+	/* The range must begin inside a VMA. */
+	if (!vma || vma->vm_start > start)
+		return -ENOMEM;
+
+	if (start > vma->vm_start)
+		prev = vma;
+
+	for (nstart = start ; ; ) {
+		unsigned int newflags;
+
+		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
+
+		newflags = vma->vm_flags | VM_LOCKED;
+		if (!on)
+			newflags &= ~VM_LOCKED;
+
+		/* Handle this VMA up to its end or the end of the range. */
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+		if (error)
+			break;
+		/* Continue after whatever VMA mlock_fixup() left in 'prev'. */
+		nstart = tmp;
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		if (nstart >= end)
+			break;
+
+		/* The next VMA has to start exactly at nstart, else there is a gap. */
+		vma = prev->vm_next;
+		if (!vma || vma->vm_start != nstart) {
+			error = -ENOMEM;
+			break;
+		}
+	}
+	return error;
+}
+
+/*
+ * mlock(2): lock the pages covering [start, start + len).
+ *
+ * Returns -EPERM when can_do_mlock() refuses, -ENOMEM when the request
+ * would exceed RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK, and
+ * otherwise the result of do_mlock().
+ */
+asmlinkage long sys_mlock(unsigned long start, size_t len)
+{
+	unsigned long wanted, lock_limit;
+	int error;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Widen the range to whole pages. */
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+
+	/* Pages requested now plus pages already locked. */
+	wanted = (len >> PAGE_SHIFT) + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	/* check against resource limits */
+	if (wanted <= lock_limit || capable(CAP_IPC_LOCK))
+		error = do_mlock(start, len, 1);
+	else
+		error = -ENOMEM;
+
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
+
+/*
+ * munlock(2): unlock the pages covering [start, start + len).
+ * Returns the result of do_mlock().
+ */
+asmlinkage long sys_munlock(unsigned long start, size_t len)
+{
+	int error;
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Widen the range to whole pages. */
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+	error = do_mlock(start, len, 0);
+
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
+
+/*
+ * Apply the MCL_* @flags to the current mm.
+ *
+ * MCL_FUTURE decides whether VM_LOCKED goes into mm->def_flags.  Unless
+ * @flags is exactly MCL_FUTURE, every existing VMA is then locked
+ * (MCL_CURRENT set) or unlocked (MCL_CURRENT clear).  Always returns 0;
+ * errors from mlock_fixup() are ignored.
+ */
+static int do_mlockall(int flags)
+{
+	struct vm_area_struct *vma, *prev = NULL;
+
+	current->mm->def_flags = (flags & MCL_FUTURE) ? VM_LOCKED : 0;
+	if (flags == MCL_FUTURE)
+		return 0;
+
+	for (vma = current->mm->mmap; vma; vma = prev->vm_next) {
+		unsigned int newflags;
+
+		if (flags & MCL_CURRENT)
+			newflags = vma->vm_flags | VM_LOCKED;
+		else
+			newflags = vma->vm_flags & ~VM_LOCKED;
+
+		/* Ignore errors */
+		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+	}
+	return 0;
+}
+
+/*
+ * mlockall(2).
+ *
+ * Returns -EINVAL for empty or unknown @flags, -EPERM when
+ * can_do_mlock() refuses, -ENOMEM when MCL_CURRENT is requested but the
+ * mm's total_vm exceeds RLIMIT_MEMLOCK and the caller lacks
+ * CAP_IPC_LOCK, and otherwise the result of do_mlockall().
+ */
+asmlinkage long sys_mlockall(int flags)
+{
+	unsigned long lock_limit;
+	int ret;
+
+	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+		return -EINVAL;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	down_write(&current->mm->mmap_sem);
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
+	    capable(CAP_IPC_LOCK))
+		ret = do_mlockall(flags);
+	else
+		ret = -ENOMEM;
+
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * munlockall(2): run do_mlockall() with no flags set, under mmap_sem
+ * held for writing, and return its result.
+ */
+asmlinkage long sys_munlockall(void)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+	ret = do_mlockall(0);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
+ * shm segments) get accounted against the user_struct instead.
+ */
+static DEFINE_SPINLOCK(shmlock_user_lock);
+
+/*
+ * Charge @size bytes (rounded up to whole pages) of locked shm to @user.
+ *
+ * Returns 1 and takes a reference on @user when the charge fits within
+ * the caller's RLIMIT_MEMLOCK or the caller has CAP_IPC_LOCK; returns 0
+ * and changes nothing otherwise.
+ */
+int user_shm_lock(size_t size, struct user_struct *user)
+{
+	unsigned long lock_limit, pages;
+	int allowed = 0;
+
+	pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	spin_lock(&shmlock_user_lock);
+	if (pages + user->locked_shm <= lock_limit || capable(CAP_IPC_LOCK)) {
+		get_uid(user);
+		user->locked_shm += pages;
+		allowed = 1;
+	}
+	spin_unlock(&shmlock_user_lock);
+
+	return allowed;
+}
+
+/*
+ * Undo user_shm_lock(): uncharge @size bytes (rounded up to whole
+ * pages) from @user and drop the reference on @user.
+ */
+void user_shm_unlock(size_t size, struct user_struct *user)
+{
+	unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	spin_lock(&shmlock_user_lock);
+	user->locked_shm -= pages;
+	spin_unlock(&shmlock_user_lock);
+
+	free_uid(user);
+}
diff --git a/mm/mmap.c b/mm/mmap.c
new file mode 100644
index 000000000000..a95ebda27446
--- /dev/null
+++ b/mm/mmap.c
@@ -0,0 +1,2082 @@
+/*
+ * mm/mmap.c
+ *
+ * Written by obz.
+ *
+ * Address space accounting code	<alan@redhat.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/hugetlb.h>
+#include <linux/profile.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mempolicy.h>
+#include <linux/rmap.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlb.h>
+
+/*
+ * WARNING: the debugging will use recursive algorithms so never enable this
+ * unless you know what you are doing.
+ */
+#undef DEBUG_MM_RB
+
+/* description of effects of mapping type and prot in current implementation.
+ * this is due to the limited x86 page protection hardware.  The expected
+ * behavior is in parens:
+ *
+ * map_type	prot
+ *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
+ * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *		
+ * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ */
+pgprot_t protection_map[16] = {
+	/*
+	 * Indexed by (vm_flags & 0x0f), see do_mmap_pgoff(): slots 0-7
+	 * hold the __Pxxx values and slots 8-15 the __Sxxx values.
+	 */
+	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
+	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
+};
+
+/*
+ * Tunables read by __vm_enough_memory() and do_mmap_pgoff(), plus the
+ * global commit counter that __vm_enough_memory() compares against the
+ * page limit it computes in the strict-overcommit case.
+ */
+int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
+int sysctl_overcommit_ratio = 50;	/* default is 50% */
+int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+atomic_t vm_committed_space = ATOMIC_INIT(0);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * The request is charged with vm_acct_memory(pages) before any check is
+ * made; every -ENOMEM return drops that charge again with
+ * vm_unacct_memory(pages), so the charge is kept only on success.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(long pages, int cap_sys_admin)
+{
+	unsigned long free, allowed;
+
+	vm_acct_memory(pages);
+
+	/*
+	 * Sometimes we want to use more memory than we have
+	 */
+	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+		return 0;
+
+	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+		unsigned long n;
+
+		free = get_page_cache_size();
+		free += nr_swap_pages;
+
+		/*
+		 * Any slabs which are created with the
+		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+		 * which are reclaimable, under pressure.  The dentry
+		 * cache and most inode caches should fall into this
+		 */
+		free += atomic_read(&slab_reclaim_pages);
+
+		/*
+		 * Leave the last 3% for root
+		 */
+		if (!cap_sys_admin)
+			free -= free / 32;
+
+		if (free > pages)
+			return 0;
+
+		/*
+		 * nr_free_pages() is very expensive on large systems,
+		 * only call if we're about to fail.
+		 */
+		n = nr_free_pages();
+		if (!cap_sys_admin)
+			n -= n / 32;
+		free += n;
+
+		if (free > pages)
+			return 0;
+		vm_unacct_memory(pages);
+		return -ENOMEM;
+	}
+
+	/* Strict accounting: a share of RAM (minus hugetlb pages) plus swap. */
+	allowed = (totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100;
+	/*
+	 * Leave the last 3% for root
+	 */
+	if (!cap_sys_admin)
+		allowed -= allowed / 32;
+	allowed += total_swap_pages;
+
+	/*
+	 * Don't let a single process grow too big:
+	 * leave 3% of the size of this process for other processes.
+	 * A caller without an mm has no size to account for.
+	 */
+	if (current->mm)
+		allowed -= current->mm->total_vm / 32;
+
+	/*
+	 * Compare as signed.  vm_committed_space is an atomic_t, i.e. a
+	 * signed int; if it ever reads negative, comparing it with an
+	 * unsigned long would convert it to a huge positive value and the
+	 * allocation would be refused for no reason.
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
+		return 0;
+
+	vm_unacct_memory(pages);
+
+	return -ENOMEM;
+}
+
+EXPORT_SYMBOL(sysctl_overcommit_memory);
+EXPORT_SYMBOL(sysctl_overcommit_ratio);
+EXPORT_SYMBOL(sysctl_max_map_count);
+EXPORT_SYMBOL(vm_committed_space);
+EXPORT_SYMBOL(__vm_enough_memory);
+
+/*
+ * Take a file-backed vma out of its address_space: give back the write
+ * denial and shared-mapping counts that __vma_link_file() took, then
+ * remove the vma from the nonlinear list or the i_mmap prio tree.
+ *
+ * Requires inode->i_mapping->i_mmap_lock
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+		struct file *file, struct address_space *mapping)
+{
+	const unsigned long vm_flags = vma->vm_flags;
+
+	if (vm_flags & VM_DENYWRITE)
+		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+	if (vm_flags & VM_SHARED)
+		mapping->i_mmap_writable--;
+
+	flush_dcache_mmap_lock(mapping);
+	if (likely(!(vm_flags & VM_NONLINEAR)))
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+	else
+		list_del_init(&vma->shared.vm_set.list);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Remove one vm structure and free it.
+ *
+ * Unhooks the vma from its file mapping (under i_mmap_lock), calls the
+ * ->close() hook if one is set, drops the file reference, unlinks the vma
+ * from its anon_vma, releases its memory policy and returns the structure
+ * to vm_area_cachep.  May sleep.
+ */
+static void remove_vm_struct(struct vm_area_struct *vma)
+{
+	struct file *backing = vma->vm_file;
+
+	might_sleep();
+
+	if (backing) {
+		struct address_space *space = backing->f_mapping;
+
+		spin_lock(&space->i_mmap_lock);
+		__remove_shared_vm_struct(vma, backing, space);
+		spin_unlock(&space->i_mmap_lock);
+	}
+
+	if (vma->vm_ops && vma->vm_ops->close)
+		vma->vm_ops->close(vma);
+
+	if (backing)
+		fput(backing);
+
+	anon_vma_unlink(vma);
+	mpol_free(vma_policy(vma));
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+/*
+ *  sys_brk() for the most part doesn't need the global kernel
+ *  lock, except when an application is doing something nasty
+ *  like trying to un-brk an area that has already been mapped
+ *  to a regular file.  in this case, the unmapping will need
+ *  to invoke file system routines that need the global lock.
+ *
+ *  The return value is always mm->brk as it stands on exit: the
+ *  requested value when the change was accepted, the old break
+ *  otherwise.  No negative error code is returned.  The whole
+ *  operation runs under down_write(mm->mmap_sem).
+ */
+asmlinkage unsigned long sys_brk(unsigned long brk)
+{
+	unsigned long rlim, retval;
+	unsigned long newbrk, oldbrk;
+	struct mm_struct *mm = current->mm;
+
+	down_write(&mm->mmap_sem);
+
+	/* A break below the end of the code segment is refused. */
+	if (brk < mm->end_code)
+		goto out;
+	newbrk = PAGE_ALIGN(brk);
+	oldbrk = PAGE_ALIGN(mm->brk);
+	/* Same page as before: nothing to map or unmap, just record brk. */
+	if (oldbrk == newbrk)
+		goto set_brk;
+
+	/* Always allow shrinking brk. */
+	if (brk <= mm->brk) {
+		/* do_munmap() returning 0 is treated as success here */
+		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
+			goto set_brk;
+		goto out;
+	}
+
+	/* Check against rlimit.. */
+	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
+		goto out;
+
+	/* Check against existing mmap mappings. */
+	/* The tested range extends one page past newbrk. */
+	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+		goto out;
+
+	/* Ok, looks good - let it rip. */
+	/* do_brk() is expected to hand back the start address it was given */
+	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+		goto out;
+set_brk:
+	/* The unaligned value is stored; alignment is redone on each call. */
+	mm->brk = brk;
+out:
+	retval = mm->brk;
+	up_write(&mm->mmap_sem);
+	return retval;
+}
+
+#ifdef DEBUG_MM_RB
+/*
+ * Debug helper for validate_mm(): walk the vma rbtree in address order
+ * and report
+ *   - a vma starting below the start of its predecessor (the running
+ *     count is reset to -1 in that case),
+ *   - a vma starting below the end of its predecessor (overlap),
+ *   - a vma whose vm_start lies above its own vm_end.
+ * Then walk backwards from the last node reached and compare the two
+ * node counts.
+ *
+ * Returns the forward count, or 0 when the forward and backward counts
+ * disagree.
+ */
+static int browse_rb(struct rb_root *root)
+{
+	int i = 0, j;
+	struct rb_node *nd, *pn = NULL;
+	unsigned long prev = 0, pend = 0;
+
+	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+		struct vm_area_struct *vma;
+		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+		if (vma->vm_start < prev)
+			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
+		if (vma->vm_start < pend)
+			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+		if (vma->vm_start > vma->vm_end)
+			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
+		i++;
+		pn = nd;
+		/*
+		 * Remember this vma's bounds for the next iteration.
+		 * Without this, prev and pend stay 0 and the two ordering
+		 * checks above can never fire.
+		 */
+		prev = vma->vm_start;
+		pend = vma->vm_end;
+	}
+	j = 0;
+	for (nd = pn; nd; nd = rb_prev(nd)) {
+		j++;
+	}
+	if (i != j)
+		printk("backwards %d, forwards %d\n", j, i), i = 0;
+	return i;
+}
+
+/*
+ * Debug consistency check: mm->map_count must equal both the length of
+ * the vm_next list and the node count reported by browse_rb().  Each
+ * mismatch is logged and any mismatch ends in BUG().
+ */
+void validate_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int on_list = 0;
+	int in_tree;
+	int broken = 0;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		on_list++;
+	if (on_list != mm->map_count) {
+		printk("map_count %d vm_next %d\n", mm->map_count, on_list);
+		broken = 1;
+	}
+
+	in_tree = browse_rb(&mm->mm_rb);
+	if (in_tree != mm->map_count) {
+		printk("map_count %d rb %d\n", mm->map_count, in_tree);
+		broken = 1;
+	}
+
+	if (broken)
+		BUG();
+}
+#else
+#define validate_mm(mm) do { } while (0)
+#endif
+
+/*
+ * Look up @addr in the mm's vma rbtree and, when no vma contains it,
+ * report where a new vma for that address would have to be linked.
+ *
+ * If some vma contains @addr it is returned immediately and none of
+ * *pprev, *rb_link, *rb_parent is written.  Otherwise:
+ *   *pprev     = the last vma visited on the way down that ends at or
+ *                below @addr (NULL if there was none),
+ *   *rb_link   = the empty child slot the descent stopped at,
+ *   *rb_parent = the node owning that slot (NULL for an empty tree),
+ * and the return value is the last vma visited that ends above @addr,
+ * or NULL.
+ */
+static struct vm_area_struct *
+find_vma_prepare(struct mm_struct *mm, unsigned long addr,
+		struct vm_area_struct **pprev, struct rb_node ***rb_link,
+		struct rb_node ** rb_parent)
+{
+	struct rb_node **slot = &mm->mm_rb.rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node *below = NULL;
+	struct vm_area_struct *above = NULL;
+
+	while (*slot) {
+		struct vm_area_struct *cur;
+
+		parent = *slot;
+		cur = rb_entry(parent, struct vm_area_struct, vm_rb);
+
+		if (cur->vm_end <= addr) {
+			/* entirely below addr: remember it, go right */
+			below = parent;
+			slot = &parent->rb_right;
+			continue;
+		}
+
+		above = cur;
+		if (cur->vm_start <= addr)
+			return above;	/* addr lies inside this vma */
+		slot = &parent->rb_left;
+	}
+
+	if (below)
+		*pprev = rb_entry(below, struct vm_area_struct, vm_rb);
+	else
+		*pprev = NULL;
+	*rb_link = slot;
+	*rb_parent = parent;
+	return above;
+}
+
+/*
+ * Splice @vma into the mm's singly linked vm_next list.  With a @prev
+ * the vma goes right after it.  Without one it becomes the new list head
+ * (mm->mmap) and its successor is the vma embedding @rb_parent, or NULL
+ * when @rb_parent is NULL.
+ */
+static inline void
+__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev, struct rb_node *rb_parent)
+{
+	if (prev) {
+		vma->vm_next = prev->vm_next;
+		prev->vm_next = vma;
+		return;
+	}
+
+	mm->mmap = vma;
+	vma->vm_next = rb_parent ?
+		rb_entry(rb_parent, struct vm_area_struct, vm_rb) : NULL;
+}
+
+/*
+ * Insert @vma into the mm's rbtree at the slot @rb_link under @rb_parent
+ * (both as produced by find_vma_prepare()), then let rb_insert_color()
+ * fix up the tree.
+ */
+void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
+	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+}
+
+/*
+ * Hook a file-backed vma into its address_space: account VM_DENYWRITE
+ * against the inode's i_writecount and VM_SHARED against
+ * i_mmap_writable, then add the vma to the nonlinear list or the i_mmap
+ * prio tree.  Does nothing for a vma without a file.
+ *
+ * The callers visible in this file hold mapping->i_mmap_lock.
+ */
+static inline void __vma_link_file(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping;
+
+	if (!file)
+		return;
+
+	mapping = file->f_mapping;
+
+	if (vma->vm_flags & VM_DENYWRITE)
+		atomic_dec(&file->f_dentry->d_inode->i_writecount);
+	if (vma->vm_flags & VM_SHARED)
+		mapping->i_mmap_writable++;
+
+	flush_dcache_mmap_lock(mapping);
+	if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+		vma_prio_tree_insert(vma, &mapping->i_mmap);
+	else
+		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+	flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Link @vma into the mm's vm_next list, its rbtree and its anon_vma, in
+ * that order.  Does not touch the file mapping or mm->map_count; the
+ * callers in this file do that themselves.
+ */
+static void
+__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+	struct vm_area_struct *prev, struct rb_node **rb_link,
+	struct rb_node *rb_parent)
+{
+	__vma_link_list(mm, vma, prev, rb_parent);
+	__vma_link_rb(mm, vma, rb_link, rb_parent);
+	__anon_vma_link(vma);
+}
+
+/*
+ * Fully link a new vma into @mm: list, rbtree, anon_vma and, for a
+ * file-backed vma, the file's address_space.  The file's i_mmap_lock is
+ * taken first, then the anon_vma lock, and they are released in the
+ * reverse order.  The vma inherits the mapping's current truncate_count.
+ * Finally map_count is bumped and the mm is validated.
+ */
+static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct vm_area_struct *prev, struct rb_node **rb_link,
+			struct rb_node *rb_parent)
+{
+	struct address_space *mapping;
+
+	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+
+	if (mapping) {
+		spin_lock(&mapping->i_mmap_lock);
+		vma->vm_truncate_count = mapping->truncate_count;
+	}
+	anon_vma_lock(vma);
+
+	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	__vma_link_file(vma);
+
+	anon_vma_unlock(vma);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+
+	mm->map_count++;
+	validate_mm(mm);
+}
+
+/*
+ * Helper for vma_adjust in the split_vma insert case:
+ * insert vm structure into list and rbtree and anon_vma,
+ * but it has already been inserted into prio_tree earlier.
+ *
+ * It is a BUG() if the vma found at or above the new vma's start
+ * begins before the new vma ends, i.e. if the ranges overlap.
+ */
+static void
+__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+{
+	struct vm_area_struct *following, *prev;
+	struct rb_node **link, *parent;
+
+	following = find_vma_prepare(mm, vma->vm_start, &prev, &link, &parent);
+	if (following && following->vm_start < vma->vm_end)
+		BUG();
+
+	__vma_link(mm, vma, prev, link, parent);
+	mm->map_count++;
+}
+
+/*
+ * Remove @vma from the mm's vm_next list and rbtree.  @prev must be the
+ * list predecessor of @vma and is dereferenced unconditionally, so the
+ * list head cannot be removed through this helper.  If the mm's lookup
+ * cache pointed at @vma it is redirected to @prev.
+ */
+static inline void
+__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev)
+{
+	prev->vm_next = vma->vm_next;
+	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	if (mm->mmap_cache == vma)
+		mm->mmap_cache = prev;
+}
+
+/*
+ * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
+ * is already present in an i_mmap tree without adjusting the tree.
+ * The following helper function should be used when such adjustments
+ * are necessary.  The "insert" vma (if any) is to be inserted
+ * before we drop the necessary locks.
+ *
+ * vma is given the new range [start, end) and file offset pgoff.  When
+ * there is no insert, the following vma ("next") is adjusted to match:
+ * it is removed if the new end swallows it completely (possibly along
+ * with the one after it), or its start is moved up or down by
+ * adjust_next pages if the boundary between the two merely shifts.
+ *
+ * Local state:
+ *   remove_next  0, 1 or 2: how many following vmas are to be removed
+ *   adjust_next  signed page count by which next->vm_start moves
+ *   importer     the vma that takes over pages from its neighbour and
+ *                may therefore need that neighbour's anon_vma
+ *
+ * The file's i_mmap_lock (if any) is taken before the anon_vma lock and
+ * released after it.
+ */
+void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *next = vma->vm_next;
+	struct vm_area_struct *importer = NULL;
+	struct address_space *mapping = NULL;
+	struct prio_tree_root *root = NULL;
+	struct file *file = vma->vm_file;
+	struct anon_vma *anon_vma = NULL;
+	long adjust_next = 0;
+	int remove_next = 0;
+
+	if (next && !insert) {
+		if (end >= next->vm_end) {
+			/*
+			 * vma expands, overlapping all the next, and
+			 * perhaps the one after too (mprotect case 6).
+			 */
+			/*
+			 * The requested end is clamped to this next's end;
+			 * remove_next == 2 records that one more vma has to
+			 * go, which is done by jumping back here from the
+			 * bottom of the function.
+			 */
+again:			remove_next = 1 + (end > next->vm_end);
+			end = next->vm_end;
+			anon_vma = next->anon_vma;
+			importer = vma;
+		} else if (end > next->vm_start) {
+			/*
+			 * vma expands, overlapping part of the next:
+			 * mprotect case 5 shifting the boundary up.
+			 */
+			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+			anon_vma = next->anon_vma;
+			importer = vma;
+		} else if (end < vma->vm_end) {
+			/*
+			 * vma shrinks, and !insert tells it's not
+			 * split_vma inserting another: so it must be
+			 * mprotect case 4 shifting the boundary down.
+			 */
+			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+			anon_vma = next->anon_vma;
+			importer = next;
+		}
+	}
+
+	if (file) {
+		mapping = file->f_mapping;
+		/* root stays NULL for nonlinear vmas: no prio tree to fix up */
+		if (!(vma->vm_flags & VM_NONLINEAR))
+			root = &mapping->i_mmap;
+		spin_lock(&mapping->i_mmap_lock);
+		if (importer &&
+		    vma->vm_truncate_count != next->vm_truncate_count) {
+			/*
+			 * unmap_mapping_range might be in progress:
+			 * ensure that the expanding vma is rescanned.
+			 */
+			importer->vm_truncate_count = 0;
+		}
+		if (insert) {
+			insert->vm_truncate_count = vma->vm_truncate_count;
+			/*
+			 * Put into prio_tree now, so instantiated pages
+			 * are visible to arm/parisc __flush_dcache_page
+			 * throughout; but we cannot insert into address
+			 * space until vma start or end is updated.
+			 */
+			__vma_link_file(insert);
+		}
+	}
+
+	/*
+	 * When changing only vma->vm_end, we don't really need
+	 * anon_vma lock: but is that case worth optimizing out?
+	 */
+	/* vma's own anon_vma takes precedence over the one taken from next */
+	if (vma->anon_vma)
+		anon_vma = vma->anon_vma;
+	if (anon_vma) {
+		spin_lock(&anon_vma->lock);
+		/*
+		 * Easily overlooked: when mprotect shifts the boundary,
+		 * make sure the expanding vma has anon_vma set if the
+		 * shrinking vma had, to cover any anon pages imported.
+		 */
+		if (importer && !importer->anon_vma) {
+			importer->anon_vma = anon_vma;
+			__anon_vma_link(importer);
+		}
+	}
+
+	/*
+	 * Take the vmas out of the prio tree while their start, end and
+	 * pgoff are being changed, and put them back afterwards.
+	 */
+	if (root) {
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_remove(vma, root);
+		if (adjust_next)
+			vma_prio_tree_remove(next, root);
+	}
+
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+	if (adjust_next) {
+		/* adjust_next is in pages and may be negative */
+		next->vm_start += adjust_next << PAGE_SHIFT;
+		next->vm_pgoff += adjust_next;
+	}
+
+	if (root) {
+		if (adjust_next)
+			vma_prio_tree_insert(next, root);
+		vma_prio_tree_insert(vma, root);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	if (remove_next) {
+		/*
+		 * vma_merge has merged next into vma, and needs
+		 * us to remove next before dropping the locks.
+		 */
+		__vma_unlink(mm, next, vma);
+		if (file)
+			__remove_shared_vm_struct(next, file, mapping);
+		if (next->anon_vma)
+			__anon_vma_merge(vma, next);
+	} else if (insert) {
+		/*
+		 * split_vma has split insert from vma, and needs
+		 * us to insert it before dropping the locks
+		 * (it may either follow vma or precede it).
+		 */
+		__insert_vm_struct(mm, insert);
+	}
+
+	if (anon_vma)
+		spin_unlock(&anon_vma->lock);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+
+	if (remove_next) {
+		/* Drop the file reference and free next outside the locks. */
+		if (file)
+			fput(file);
+		mm->map_count--;
+		mpol_free(vma_policy(next));
+		kmem_cache_free(vm_area_cachep, next);
+		/*
+		 * In mprotect's case 6 (see comments on vma_merge),
+		 * we must remove another next too. It would clutter
+		 * up the code too much to do both in one go.
+		 */
+		if (remove_next == 2) {
+			next = vma->vm_next;
+			goto again;
+		}
+	}
+
+	validate_mm(mm);
+}
+
+/*
+ * If the vma has a ->close operation then the driver probably needs to release
+ * per-vma resources, so we don't attempt to merge those.
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
+
+/*
+ * Basic merge test: the vma must carry exactly the requested flags, map
+ * the same file (or both none), and must not have a ->close() hook.
+ * Returns 1 when all three hold, 0 otherwise.
+ */
+static inline int is_mergeable_vma(struct vm_area_struct *vma,
+			struct file *file, unsigned long vm_flags)
+{
+	if (vma->vm_flags != vm_flags || vma->vm_file != file)
+		return 0;
+
+	return !(vma->vm_ops && vma->vm_ops->close);
+}
+
+/*
+ * Two anon_vma pointers are compatible when at least one of them is
+ * still unassigned (NULL) or when both name the same anon_vma.
+ */
+static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+					struct anon_vma *anon_vma2)
+{
+	if (!anon_vma1 || !anon_vma2)
+		return 1;
+
+	return anon_vma1 == anon_vma2;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * in front of (at a lower virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We don't check here for the merged mmap wrapping around the end of pagecache
+ * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * wrap, nor mmaps which cover the final page at index -1UL.
+ */
+static int
+can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	if (!is_mergeable_vma(vma, file, vm_flags))
+		return 0;
+	if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma))
+		return 0;
+
+	/* @vm_pgoff is the offset just past the request: it must meet vma */
+	return vma->vm_pgoff == vm_pgoff;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * beyond (at a higher virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ */
+static int
+can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	pgoff_t nr_pages;
+
+	if (!is_mergeable_vma(vma, file, vm_flags))
+		return 0;
+	if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma))
+		return 0;
+
+	/* the request must start at the offset where vma's pages end */
+	nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	return vma->vm_pgoff + nr_pages == vm_pgoff;
+}
+
+/*
+ * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
+ * whether that can be merged with its predecessor or its successor.
+ * Or both (it neatly fills a hole).
+ *
+ * In most cases - when called for mmap, brk or mremap - [addr,end) is
+ * certain not to be mapped by the time vma_merge is called; but when
+ * called for mprotect, it is certain to be already mapped (either at
+ * an offset within prev, or at the start of next), and the flags of
+ * this area are about to be changed to vm_flags - and the no-change
+ * case has already been eliminated.
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ *     AAAA             AAAA                AAAA          AAAA
+ *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
+ *    cannot merge    might become    might become    might become
+ *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
+ *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
+ *    mremap move:                                    PPPPNNNNNNNN 8
+ *        AAAA
+ *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
+ *    might become    case 1 below    case 2 below    case 3 below
+ *
+ * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
+ * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ */
+/*
+ * Returns the vma that covers [addr,end) after a successful merge
+ * (either prev or the vma that followed prev on entry), or NULL when
+ * nothing could be merged.  All changes are made through vma_adjust().
+ */
+struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev, unsigned long addr,
+			unsigned long end, unsigned long vm_flags,
+		     	struct anon_vma *anon_vma, struct file *file,
+			pgoff_t pgoff, struct mempolicy *policy)
+{
+	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+	struct vm_area_struct *area, *next;
+
+	/*
+	 * We later require that vma->vm_flags == vm_flags,
+	 * so this tests vma->vm_flags & VM_SPECIAL, too.
+	 */
+	if (vm_flags & VM_SPECIAL)
+		return NULL;
+
+	/* With no prev, the candidate successor is the first vma of the mm. */
+	if (prev)
+		next = prev->vm_next;
+	else
+		next = mm->mmap;
+	/* area keeps the vma right after prev even if next moves on below */
+	area = next;
+	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
+		next = next->vm_next;
+
+	/*
+	 * Can it merge with the predecessor?
+	 */
+	if (prev && prev->vm_end == addr &&
+  			mpol_equal(vma_policy(prev), policy) &&
+			can_vma_merge_after(prev, vm_flags,
+						anon_vma, file, pgoff)) {
+		/*
+		 * OK, it can.  Can we now merge in the successor as well?
+		 */
+		/*
+		 * prev and next must also be compatible with each other:
+		 * each was only checked against the caller's anon_vma.
+		 */
+		if (next && end == next->vm_start &&
+				mpol_equal(policy, vma_policy(next)) &&
+				can_vma_merge_before(next, vm_flags,
+					anon_vma, file, pgoff+pglen) &&
+				is_mergeable_anon_vma(prev->anon_vma,
+						      next->anon_vma)) {
+							/* cases 1, 6 */
+			vma_adjust(prev, prev->vm_start,
+				next->vm_end, prev->vm_pgoff, NULL);
+		} else					/* cases 2, 5, 7 */
+			vma_adjust(prev, prev->vm_start,
+				end, prev->vm_pgoff, NULL);
+		return prev;
+	}
+
+	/*
+	 * Can this new request be merged in front of next?
+	 */
+	if (next && end == next->vm_start &&
+ 			mpol_equal(policy, vma_policy(next)) &&
+			can_vma_merge_before(next, vm_flags,
+					anon_vma, file, pgoff+pglen)) {
+		/*
+		 * Case 4 shrinks prev down to addr and leaves it to
+		 * vma_adjust() to move the following vma's start down.
+		 * Otherwise area itself is stretched to cover
+		 * [addr, next->vm_end).
+		 */
+		if (prev && addr < prev->vm_end)	/* case 4 */
+			vma_adjust(prev, prev->vm_start,
+				addr, prev->vm_pgoff, NULL);
+		else					/* cases 3, 8 */
+			vma_adjust(area, addr, next->vm_end,
+				next->vm_pgoff - pglen, NULL);
+		return area;
+	}
+
+	return NULL;
+}
+
+/*
+ * find_mergeable_anon_vma is used by anon_vma_prepare, to check
+ * neighbouring vmas for a suitable anon_vma, before it goes off
+ * to allocate a new anon_vma.  It checks because a repetitive
+ * sequence of mprotects and faults may otherwise lead to distinct
+ * anon_vmas being allocated, preventing vma merge in subsequent
+ * mprotect.
+ */
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
+{
+	const unsigned long prot_bits = VM_READ|VM_WRITE|VM_EXEC;
+	struct vm_area_struct *neighbour;
+	unsigned long vm_flags;
+
+	/* First candidate: the vma directly after this one. */
+	neighbour = vma->vm_next;
+	if (neighbour) {
+		/*
+		 * Since only mprotect tries to remerge vmas, match flags
+		 * which might be mprotected into each other later on:
+		 * compare as if vma had the neighbour's protection bits.
+		 * Neither mlock nor madvise tries to remerge at present,
+		 * so leave their flags as obstructing a merge.
+		 */
+		vm_flags = (vma->vm_flags & ~prot_bits) |
+			   (neighbour->vm_flags & prot_bits);
+
+		if (neighbour->anon_vma &&
+		    vma->vm_end == neighbour->vm_start &&
+		    mpol_equal(vma_policy(vma), vma_policy(neighbour)) &&
+		    can_vma_merge_before(neighbour, vm_flags,
+				NULL, vma->vm_file, vma->vm_pgoff +
+				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
+			return neighbour->anon_vma;
+	}
+
+	/*
+	 * Second candidate: the vma directly before this one.
+	 *
+	 * It is potentially slow to have to call find_vma_prev here.
+	 * But it's only on the first write fault on the vma, not
+	 * every time, and we could devise a way to avoid it later
+	 * (e.g. stash info in next's anon_vma_node when assigning
+	 * an anon_vma, or when trying vma_merge).  Another time.
+	 */
+	if (find_vma_prev(vma->vm_mm, vma->vm_start, &neighbour) != vma)
+		BUG();
+
+	if (neighbour) {
+		vm_flags = (vma->vm_flags & ~prot_bits) |
+			   (neighbour->vm_flags & prot_bits);
+
+		if (neighbour->anon_vma &&
+		    neighbour->vm_end == vma->vm_start &&
+		    mpol_equal(vma_policy(neighbour), vma_policy(vma)) &&
+		    can_vma_merge_after(neighbour, vm_flags,
+				NULL, vma->vm_file, vma->vm_pgoff))
+			return neighbour->anon_vma;
+	}
+
+	/*
+	 * There's no absolute need to look only at touching neighbours:
+	 * we could search further afield for "compatible" anon_vmas.
+	 * But it would probably just be a waste of time searching,
+	 * or lead to too many vmas hanging off the same anon_vma.
+	 * We're trying to allow mprotect remerging later on,
+	 * not trying to minimize memory used for anon_vmas.
+	 */
+	return NULL;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Add @pages (which may be negative) to the per-mm statistics selected
+ * by the mapping's @flags and @file:
+ *   - hugetlb mappings count as shared_vm unless VM_DONTCOPY is set, and
+ *     touch nothing else;
+ *   - file mappings count as shared_vm, and also as exec_vm when they
+ *     are executable but not writable;
+ *   - anonymous mappings with one of the stack growth flags count as
+ *     stack_vm;
+ *   - VM_RESERVED or VM_IO mappings additionally count as reserved_vm.
+ */
+void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+						struct file *file, long pages)
+{
+	const unsigned long growth_flags
+		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
+
+#ifdef CONFIG_HUGETLB
+	if (flags & VM_HUGETLB) {
+		if (flags & VM_DONTCOPY)
+			return;
+		mm->shared_vm += pages;
+		return;
+	}
+#endif /* CONFIG_HUGETLB */
+
+	if (file) {
+		mm->shared_vm += pages;
+		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
+			mm->exec_vm += pages;
+	} else if (flags & growth_flags) {
+		mm->stack_vm += pages;
+	}
+
+	if (flags & (VM_RESERVED|VM_IO))
+		mm->reserved_vm += pages;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Create a new mapping of len bytes in current->mm, backed by file at
+ * page offset pgoff, or anonymous when file is NULL.
+ *
+ * Returns the start address of the mapping on success.  On failure a
+ * negative errno is returned in the unsigned long; such a value is never
+ * page aligned, which is also how the result of get_unmapped_area() is
+ * tested below.
+ *
+ * The caller must hold down_write(current->mm->mmap_sem).
+ * With MAP_POPULATE the semaphore is dropped and retaken around
+ * sys_remap_file_pages() before returning.
+ */
+
+unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+			unsigned long len, unsigned long prot,
+			unsigned long flags, unsigned long pgoff)
+{
+	struct mm_struct * mm = current->mm;
+	struct vm_area_struct * vma, * prev;
+	struct inode *inode;
+	/* NOTE(review): unsigned int here, but vma_merge() takes unsigned long */
+	unsigned int vm_flags;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node ** rb_link, * rb_parent;
+	int accountable = 1;
+	/* reqprot keeps the caller's prot before READ_IMPLIES_EXEC is applied */
+	unsigned long charged = 0, reqprot = prot;
+
+	if (file) {
+		/* hugetlb files are not charged against the commit limit here */
+		if (is_file_hugepages(file))
+			accountable = 0;
+
+		if (!file->f_op || !file->f_op->mmap)
+			return -ENODEV;
+
+		if ((prot & PROT_EXEC) &&
+		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
+			return -EPERM;
+	}
+	/*
+	 * Does the application expect PROT_READ to imply PROT_EXEC?
+	 *
+	 * (the exception is when the underlying filesystem is noexec
+	 *  mounted, in which case we don't add PROT_EXEC.)
+	 */
+	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
+		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+			prot |= PROT_EXEC;
+
+	if (!len)
+		return -EINVAL;
+
+	/* Careful about overflows.. */
+	len = PAGE_ALIGN(len);
+	if (!len || len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* offset overflow? */
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+               return -EOVERFLOW;
+
+	/* Too many mappings? */
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	/* Obtain the address to map to. we verify (or select) it and ensure
+	 * that it represents a valid section of the address space.
+	 */
+	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	/* low bits set means get_unmapped_area() returned an error value */
+	if (addr & ~PAGE_MASK)
+		return addr;
+
+	/* Do simple checking here so the lower-level routines won't have
+	 * to. we assume access permissions have been handled by the open
+	 * of the memory object, so we don't do any here.
+	 */
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (flags & MAP_LOCKED) {
+		if (!can_do_mlock())
+			return -EPERM;
+		vm_flags |= VM_LOCKED;
+	}
+	/* mlock MCL_FUTURE? */
+	if (vm_flags & VM_LOCKED) {
+		/* both sides of this comparison are in bytes */
+		unsigned long locked, lock_limit;
+		locked = mm->locked_vm << PAGE_SHIFT;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += len;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+
+	inode = file ? file->f_dentry->d_inode : NULL;
+
+	if (file) {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+				return -EACCES;
+
+			/*
+			 * Make sure we don't allow writing to an append-only
+			 * file..
+			 */
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			/*
+			 * Make sure there are no mandatory locks on the file.
+			 */
+			if (locks_verify_locked(inode))
+				return -EAGAIN;
+
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			/*
+			 * A file not opened for writing keeps VM_MAYSHARE
+			 * but loses VM_SHARED and VM_MAYWRITE.
+			 */
+			if (!(file->f_mode & FMODE_WRITE))
+				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+
+			/* fall through */
+		case MAP_PRIVATE:
+			if (!(file->f_mode & FMODE_READ))
+				return -EACCES;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	} else {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			break;
+		case MAP_PRIVATE:
+			/*
+			 * Set pgoff according to addr for anon_vma.
+			 */
+			pgoff = addr >> PAGE_SHIFT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* the LSM hook sees both the requested and the effective prot */
+	error = security_file_mmap(file, reqprot, prot, flags);
+	if (error)
+		return error;
+		
+	/* Clear old maps */
+	error = -ENOMEM;
+munmap_back:
+	/*
+	 * Repeat until nothing overlaps [addr, addr + len).  The final
+	 * lookup also leaves prev, rb_link and rb_parent describing where
+	 * the new vma is to be linked.
+	 */
+	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (vma && vma->vm_start < addr + len) {
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+		goto munmap_back;
+	}
+
+	/* Check against address space limit. */
+	if ((mm->total_vm << PAGE_SHIFT) + len
+	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
+		return -ENOMEM;
+
+	if (accountable && (!(flags & MAP_NORESERVE) ||
+			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+		if (vm_flags & VM_SHARED) {
+			/* Check memory availability in shmem_file_setup? */
+			vm_flags |= VM_ACCOUNT;
+		} else if (vm_flags & VM_WRITE) {
+			/*
+			 * Private writable mapping: check memory availability
+			 */
+			/* charged is remembered so the error paths can undo it */
+			charged = len >> PAGE_SHIFT;
+			if (security_vm_enough_memory(charged))
+				return -ENOMEM;
+			vm_flags |= VM_ACCOUNT;
+		}
+	}
+
+	/*
+	 * Can we just expand an old private anonymous mapping?
+	 * The VM_SHARED test is necessary because shmem_zero_setup
+	 * will create the file object for a shared anonymous map below.
+	 */
+	if (!file && !(vm_flags & VM_SHARED) &&
+	    vma_merge(mm, prev, addr, addr + len, vm_flags,
+					NULL, NULL, pgoff, NULL))
+		goto out;
+
+	/*
+	 * Determine the object being mapped and call the appropriate
+	 * specific mapper. the address has already been validated, but
+	 * not unmapped, but the maps are removed from the list.
+	 */
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
+	memset(vma, 0, sizeof(*vma));
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_flags = vm_flags;
+	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_pgoff = pgoff;
+
+	if (file) {
+		error = -EINVAL;
+		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+			goto free_vma;
+		if (vm_flags & VM_DENYWRITE) {
+			error = deny_write_access(file);
+			if (error)
+				goto free_vma;
+			/*
+			 * Remember to undo deny_write_access() with an
+			 * atomic_inc of i_writecount on every later path.
+			 */
+			correct_wcount = 1;
+		}
+		vma->vm_file = file;
+		get_file(file);
+		error = file->f_op->mmap(file, vma);
+		if (error)
+			goto unmap_and_free_vma;
+	} else if (vm_flags & VM_SHARED) {
+		error = shmem_zero_setup(vma);
+		if (error)
+			goto free_vma;
+	}
+
+	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
+	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
+	 * that memory reservation must be checked; but that reservation
+	 * belongs to shared memory object, not to vma: so now clear it.
+	 */
+	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
+		vma->vm_flags &= ~VM_ACCOUNT;
+
+	/* Can addr have changed??
+	 *
+	 * Answer: Yes, several device drivers can do it in their
+	 *         f_op->mmap method. -DaveM
+	 */
+	addr = vma->vm_start;
+	pgoff = vma->vm_pgoff;
+	vm_flags = vma->vm_flags;
+
+	/*
+	 * File mappings get a merge attempt now that ->mmap() has run.
+	 * If the merge succeeds, the freshly built vma is redundant and is
+	 * torn down again in the else branch.
+	 */
+	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
+			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
+		/* pick up vma->vm_file for the statistics at out: below */
+		file = vma->vm_file;
+		vma_link(mm, vma, prev, rb_link, rb_parent);
+		if (correct_wcount)
+			atomic_inc(&inode->i_writecount);
+	} else {
+		if (file) {
+			if (correct_wcount)
+				atomic_inc(&inode->i_writecount);
+			fput(file);
+		}
+		mpol_free(vma_policy(vma));
+		kmem_cache_free(vm_area_cachep, vma);
+	}
+out:	
+	mm->total_vm += len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += len >> PAGE_SHIFT;
+		make_pages_present(addr, addr + len);
+	}
+	if (flags & MAP_POPULATE) {
+		/* mmap_sem is released around the call and retaken after it */
+		up_write(&mm->mmap_sem);
+		sys_remap_file_pages(addr, len, 0,
+					pgoff, flags & MAP_NONBLOCK);
+		down_write(&mm->mmap_sem);
+	}
+	return addr;
+
+	/* Error unwinding: each label undoes one more step, then falls through. */
+unmap_and_free_vma:
+	if (correct_wcount)
+		atomic_inc(&inode->i_writecount);
+	vma->vm_file = NULL;
+	fput(file);
+
+	/* Undo any partial mapping done by a device driver. */
+	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+	if (charged)
+		vm_unacct_memory(charged);
+	return error;
+}
+
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+/*
+ * Default bottom-up search for an address range which is currently
+ * unmapped.  For shmat() with addr=0.
+ *
+ * Ugly calling convention alert:
+ * a return value with any of the low (sub-page) bits set is an error
+ * value rather than an address, ie
+ *	if (ret & ~PAGE_MASK)
+ *		error = ret;
+ *
+ * This function "knows" that -ENOMEM has those bits set.
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long search_base;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* Honour the caller's hint when the page-aligned range is free. */
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (addr <= TASK_SIZE - len) {
+			if (!vma || vma->vm_start >= addr + len)
+				return addr;
+		}
+	}
+
+	/* Otherwise scan upwards, beginning at the cached position. */
+	search_base = mm->free_area_cache;
+	addr = search_base;
+	vma = find_vma(mm, addr);
+	for (;;) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (addr > TASK_SIZE - len) {
+			/* Ran off the top: give up if this pass began at the base */
+			if (search_base == TASK_UNMAPPED_BASE)
+				return -ENOMEM;
+			/*
+			 * ... else start a new search from the base - just in
+			 * case we missed some holes.
+			 */
+			search_base = TASK_UNMAPPED_BASE;
+			addr = search_base;
+			vma = find_vma(mm, addr);
+			continue;
+		}
+		if (!vma || vma->vm_start >= addr + len) {
+			/* Remember the place where we stopped the search. */
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		/* No room before this vma: step past it and try again. */
+		addr = vma->vm_end;
+		vma = vma->vm_next;
+	}
+}
+#endif	
+
+/*
+ * Bottom-up counterpart run when an area is unmapped: pull the
+ * free-area hint down if the new hole starts below it.
+ */
+void arch_unmap_area(struct vm_area_struct *area)
+{
+	struct mm_struct *mm = area->vm_mm;
+	unsigned long hole = area->vm_start;
+
+	/* Holes below the unmapped base never move the hint. */
+	if (hole < TASK_UNMAPPED_BASE)
+		return;
+
+	/* Is this a new hole at the lowest possible address? */
+	if (hole < mm->free_area_cache)
+		mm->free_area_cache = hole;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base, mm->mmap_base).
+ *
+ * @filp, @pgoff, @flags: only passed through to the bottom-up fallback.
+ * @addr0: optional address hint (0 for none).
+ * @len: length of the range wanted.
+ *
+ * Returns the start of a free range, or a negative errno cast to
+ * unsigned long (low bits set) on failure.  The search order is:
+ * the hint, the cached position, a walk down from mmap_base, and
+ * finally the bottom-up allocator.
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = addr0;
+
+	/* requested length too big for entire address space */
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* requesting a specific address */
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+				(!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache;
+
+	/* make sure it can fit in the remaining address space */
+	if (addr >= len) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start)
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+	}
+
+	/*
+	 * A request larger than everything below the base cannot be
+	 * placed top-down.  Computing mmap_base - len would wrap around
+	 * to a huge address for which find_vma() finds nothing, and that
+	 * bogus address would be handed back as free.  Go straight to
+	 * the bottom-up fallback instead.
+	 */
+	if (mm->mmap_base < len)
+		goto bottomup;
+
+	addr = mm->mmap_base-len;
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (!vma || addr+len <= vma->vm_start)
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+	} while (len <= vma->vm_start);
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+
+	return addr;
+}
+#endif
+
+/*
+ * Top-down counterpart run when an area is unmapped: raise the
+ * free-area hint to the end of the new hole, but never above the base.
+ */
+void arch_unmap_area_topdown(struct vm_area_struct *area)
+{
+	struct mm_struct *mm = area->vm_mm;
+
+	/* Is this a new hole at the highest possible address? */
+	if (mm->free_area_cache < area->vm_end)
+		mm->free_area_cache = area->vm_end;
+
+	/* don't allow allocations above current base */
+	if (mm->free_area_cache > mm->mmap_base)
+		mm->free_area_cache = mm->mmap_base;
+}
+
+/*
+ * Pick (or, for MAP_FIXED, validate) the address for a new mapping.
+ *
+ * Without MAP_FIXED the choice is delegated to the file's own
+ * ->get_unmapped_area method when it has one, else to the method
+ * installed in current->mm.  With MAP_FIXED the given address is only
+ * checked: it must fit below TASK_SIZE, be page aligned, and pass the
+ * hugepage range test.  Errors are returned as negative errno values.
+ */
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	unsigned long rejected;
+
+	if (!(flags & MAP_FIXED)) {
+		if (file && file->f_op && file->f_op->get_unmapped_area)
+			return file->f_op->get_unmapped_area(file, addr, len,
+							pgoff, flags);
+
+		return current->mm->get_unmapped_area(file, addr, len,
+							pgoff, flags);
+	}
+
+	if (addr > TASK_SIZE - len)
+		return -ENOMEM;
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (file && is_file_hugepages(file)) {
+		/*
+		 * Check if the given range is hugepage aligned, and
+		 * can be made suitable for hugepages.
+		 */
+		rejected = prepare_hugepage_range(addr, len);
+	} else {
+		/*
+		 * Ensure that a normal request is not falling in a
+		 * reserved hugepage range.  For some archs like IA-64,
+		 * there is a separate region for hugepages.
+		 */
+		rejected = is_hugepage_only_range(current->mm, addr, len);
+	}
+	if (rejected)
+		return -EINVAL;
+	return addr;
+}
+
+EXPORT_SYMBOL(get_unmapped_area);
+
+/*
+ * Look up the first VMA which satisfies  addr < vm_end,  NULL if none.
+ * A NULL mm yields NULL.  A successful tree lookup refreshes
+ * mm->mmap_cache.
+ */
+struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *best;
+	struct rb_node *node;
+
+	if (!mm)
+		return NULL;
+
+	/* Check the cache first. */
+	/* (Cache hit rate is typically around 35%.) */
+	best = mm->mmap_cache;
+	if (best && best->vm_start <= addr && addr < best->vm_end)
+		return best;
+
+	/* Cache miss: descend the rb-tree, keeping the lowest candidate. */
+	best = NULL;
+	for (node = mm->mm_rb.rb_node; node; ) {
+		struct vm_area_struct *cur;
+
+		cur = rb_entry(node, struct vm_area_struct, vm_rb);
+		if (cur->vm_end <= addr) {
+			/* Entirely below addr: only the right side can match. */
+			node = node->rb_right;
+			continue;
+		}
+
+		best = cur;
+		if (cur->vm_start <= addr)
+			break;		/* addr lies inside this vma */
+		node = node->rb_left;
+	}
+
+	if (best)
+		mm->mmap_cache = best;
+	return best;
+}
+
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in
+ * *pprev (NULL when there is none).  Unlike find_vma, this neither
+ * reads nor updates mm->mmap_cache.
+ */
+struct vm_area_struct *
+find_vma_prev(struct mm_struct *mm, unsigned long addr,
+			struct vm_area_struct **pprev)
+{
+	struct vm_area_struct *prev = NULL;
+	struct rb_node *node;
+
+	if (!mm) {
+		*pprev = NULL;
+		return NULL;
+	}
+
+	/* Go through the RB tree quickly. */
+	for (node = mm->mm_rb.rb_node; node; ) {
+		struct vm_area_struct *cur;
+
+		cur = rb_entry(node, struct vm_area_struct, vm_rb);
+		if (addr < cur->vm_end) {
+			node = node->rb_left;
+			continue;
+		}
+
+		/* cur ends at or below addr: it is a candidate predecessor */
+		prev = cur;
+		if (!cur->vm_next || addr < cur->vm_next->vm_end)
+			break;
+		node = node->rb_right;
+	}
+
+	*pprev = prev;
+	if (prev)
+		return prev->vm_next;
+
+	/* addr is lower than the end of the first VMA (if there is one) */
+	return mm->mmap;
+}
+
+/*
+ * Check that growing a stack vma is allowed and, if so, update the
+ * accounting.  Shared by the grow-up and grow-down cases.
+ *
+ * @size is the size the vma would have after growing, in bytes;
+ * @grow is the number of pages being added.
+ * Returns 0 on success or -ENOMEM if any limit refuses the growth.
+ */
+static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct rlimit *limits = current->signal->rlim;
+
+	/* address space limit test */
+	if (mm->total_vm + grow > limits[RLIMIT_AS].rlim_cur >> PAGE_SHIFT)
+		return -ENOMEM;
+
+	/* stack size limit test */
+	if (size > limits[RLIMIT_STACK].rlim_cur)
+		return -ENOMEM;
+
+	/* mlock limit test, only relevant to locked vmas */
+	if (vma->vm_flags & VM_LOCKED) {
+		unsigned long max_pages;
+		unsigned long want_pages;
+
+		want_pages = mm->locked_vm + grow;
+		max_pages = limits[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+		if (want_pages > max_pages && !capable(CAP_IPC_LOCK))
+			return -ENOMEM;
+	}
+
+	/*
+	 * Overcommit..  This must be the final test, as it will
+	 * update security statistics.
+	 */
+	if (security_vm_enough_memory(grow))
+		return -ENOMEM;
+
+	/* All tests passed: commit the new totals */
+	mm->total_vm += grow;
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm += grow;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	return 0;
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+/*
+ * Upward-growing stacks: extend vma so that it covers 'address', which
+ * lies above vma->vm_end.  Returns 0 on success, -EFAULT if vma is not
+ * a VM_GROWSUP vma, -ENOMEM on allocation or accounting failure.
+ */
+int expand_stack(struct vm_area_struct * vma, unsigned long address)
+{
+	unsigned long new_end;
+	int error = 0;
+
+	if (!(vma->vm_flags & VM_GROWSUP))
+		return -EFAULT;
+
+	/*
+	 * We must make sure the anon_vma is allocated
+	 * so that the anon_vma locking is not a noop.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+	anon_vma_lock(vma);
+
+	/*
+	 * vma->vm_start/vm_end cannot change under us because the caller
+	 * is required to hold the mmap_sem in read mode.  We need the
+	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 */
+	new_end = (address + 4 + PAGE_SIZE - 1) & PAGE_MASK;
+
+	/* Somebody else might have raced and expanded it already */
+	if (new_end > vma->vm_end) {
+		unsigned long size = new_end - vma->vm_start;
+		unsigned long grow = (new_end - vma->vm_end) >> PAGE_SHIFT;
+
+		error = acct_stack_growth(vma, size, grow);
+		if (!error)
+			vma->vm_end = new_end;
+	}
+	anon_vma_unlock(vma);
+	return error;
+}
+
+/*
+ * Upward-growing stacks: return the vma containing addr, growing the
+ * preceding vma up to cover it when addr falls in a gap.  Returns NULL
+ * if there is no preceding vma or the expansion fails.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma, *prev;
+
+	addr &= PAGE_MASK;
+	vma = find_vma_prev(mm, addr, &prev);
+
+	/* Already mapped: nothing to extend. */
+	if (vma && vma->vm_start <= addr)
+		return vma;
+
+	if (!prev)
+		return NULL;
+	if (expand_stack(prev, addr))
+		return NULL;
+
+	/* A locked vma gets its newly added pages faulted in right away. */
+	if (prev->vm_flags & VM_LOCKED)
+		make_pages_present(addr, prev->vm_end);
+	return prev;
+}
+#else
+/*
+ * Downward-growing stacks: extend vma so that it covers 'address',
+ * which lies below vma->vm_start.  Returns 0 on success or -ENOMEM on
+ * allocation or accounting failure.
+ */
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	unsigned long new_start;
+	int error = 0;
+
+	/*
+	 * We must make sure the anon_vma is allocated
+	 * so that the anon_vma locking is not a noop.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+	anon_vma_lock(vma);
+
+	/*
+	 * vma->vm_start/vm_end cannot change under us because the caller
+	 * is required to hold the mmap_sem in read mode.  We need the
+	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 */
+	new_start = address & PAGE_MASK;
+
+	/* Somebody else might have raced and expanded it already */
+	if (new_start < vma->vm_start) {
+		unsigned long size = vma->vm_end - new_start;
+		unsigned long grow = (vma->vm_start - new_start) >> PAGE_SHIFT;
+
+		error = acct_stack_growth(vma, size, grow);
+		if (!error) {
+			/* Move the start down and keep vm_pgoff in step. */
+			vma->vm_start = new_start;
+			vma->vm_pgoff -= grow;
+		}
+	}
+	anon_vma_unlock(vma);
+	return error;
+}
+
+/*
+ * Downward-growing stacks: return the vma containing addr, growing the
+ * VM_GROWSDOWN vma just above it when addr falls in a gap.  Returns
+ * NULL if no vma ends above addr, the vma above cannot grow down, or
+ * the expansion fails.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long old_start;
+
+	addr &= PAGE_MASK;
+	vma = find_vma(mm, addr);
+
+	/* Either nothing at or above addr, or addr is already mapped. */
+	if (!vma || vma->vm_start <= addr)
+		return vma;
+
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return NULL;
+
+	/* Remember the old start: it bounds the range to fault in below. */
+	old_start = vma->vm_start;
+	if (expand_stack(vma, addr))
+		return NULL;
+
+	if (vma->vm_flags & VM_LOCKED)
+		make_pages_present(addr, old_start);
+	return vma;
+}
+#endif
+
+/*
+ * Try to free as many page directory entries as we can,
+ * without having to work very hard at actually scanning
+ * the page tables themselves.
+ *
+ * Right now we try to free page tables if we have a nice
+ * PGDIR-aligned area that got free'd up. We could be more
+ * granular if we want to, but this is fast and simple,
+ * and covers the bad cases.
+ *
+ * "prev", if it exists, points to a vma before the one
+ * we just free'd - but there's no telling how much before.
+ *
+ * The function widens [start, end) to a candidate range [first, last),
+ * shrinks that range so it does not overlap the neighbouring vmas found
+ * on the mm's vma list, and then clears page tables for what is left.
+ */
+static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+	unsigned long start, unsigned long end)
+{
+	/* Candidate range: start rounded down to a PGDIR boundary ... */
+	unsigned long first = start & PGDIR_MASK;
+	/* ... up to end plus almost one PGDIR_SIZE (not masked here). */
+	unsigned long last = end + PGDIR_SIZE - 1;
+	struct mm_struct *mm = tlb->mm;
+
+	/* Clamp to the mm's size; "last < end" catches arithmetic wrap. */
+	if (last > MM_VM_SIZE(mm) || last < end)
+		last = MM_VM_SIZE(mm);
+
+	if (!prev) {
+		/* No hint given: start from the head of the vma list. */
+		prev = mm->mmap;
+		if (!prev)
+			goto no_mmaps;
+		if (prev->vm_end > start) {
+			/*
+			 * The first vma ends above start, so nothing lies
+			 * below the freed area; only bound the range above.
+			 */
+			if (last > prev->vm_start)
+				last = prev->vm_start;
+			goto no_mmaps;
+		}
+	}
+	for (;;) {
+		struct vm_area_struct *next = prev->vm_next;
+
+		if (next) {
+			/* Advance until next starts at or above start. */
+			if (next->vm_start < start) {
+				prev = next;
+				continue;
+			}
+			/* next is the upper neighbour: do not reach into it. */
+			if (last > next->vm_start)
+				last = next->vm_start;
+		}
+		/* prev is the lower neighbour: do not reach into it. */
+		if (prev->vm_end > first)
+			first = prev->vm_end;
+		break;
+	}
+no_mmaps:
+	if (last < first)	/* for arches with discontiguous pgd indices */
+		return;
+	/* Never go below the first user page directory entry. */
+	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
+		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
+	/* No point trying to free anything if we're in the same pte page */
+	if ((first & PMD_MASK) < (last & PMD_MASK)) {
+		clear_page_range(tlb, first, last);
+		flush_tlb_pgtables(mm, first, last);
+	}
+}
+
+/*
+ * Normal function to fix up a mapping.
+ * This is the default for when an area has no specific function, and
+ * may be used as part of a more specific routine.
+ *
+ * By the time this function is called, the area struct has been
+ * removed from the process mapping list.  Accounting is done against
+ * area->vm_mm; the area is then handed to the mm's unmap_area hook
+ * and to remove_vm_struct().
+ */
+static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
+{
+	struct mm_struct *owner = area->vm_mm;
+	size_t len = area->vm_end - area->vm_start;
+
+	owner->total_vm -= len >> PAGE_SHIFT;
+	if (area->vm_flags & VM_LOCKED)
+		owner->locked_vm -= len >> PAGE_SHIFT;
+	vm_stat_unaccount(area);
+	owner->unmap_area(area);
+	remove_vm_struct(area);
+}
+
+/*
+ * Update the VMA and inode share lists.
+ *
+ * The memory areas to release are chained through vm_next starting at
+ * 'mpnt' (which must not be NULL): release each one in turn and then
+ * validate the mm.
+ */
+static void unmap_vma_list(struct mm_struct *mm,
+	struct vm_area_struct *mpnt)
+{
+	struct vm_area_struct *next;
+
+	for (;;) {
+		/* Fetch the link before unmap_vma() gets hold of the vma. */
+		next = mpnt->vm_next;
+		unmap_vma(mm, mpnt);
+		if (next == NULL)
+			break;
+		mpnt = next;
+	}
+	validate_mm(mm);
+}
+
+/*
+ * Get rid of page table information in the indicated region:
+ * unmap the pages of the vma list starting at 'vma' within
+ * [start, end), return the accounted memory, and free page tables.
+ *
+ * Called with the page table lock held.
+ */
+static void unmap_region(struct mm_struct *mm,
+	struct vm_area_struct *vma,
+	struct vm_area_struct *prev,
+	unsigned long start,
+	unsigned long end)
+{
+	unsigned long accounted = 0;
+	struct mmu_gather *gather;
+
+	lru_add_drain();
+	gather = tlb_gather_mmu(mm, 0);
+	unmap_vmas(&gather, mm, vma, start, end, &accounted, NULL);
+	vm_unacct_memory(accounted);
+
+	/* Hugepage-only ranges have their own page table teardown. */
+	if (!is_hugepage_only_range(mm, start, end - start))
+		free_pgtables(gather, prev, start, end);
+	else
+		hugetlb_free_pgtables(gather, prev, start, end);
+	tlb_finish_mmu(gather, start, end);
+}
+
+/*
+ * Create a list of vma's touched by the unmap, removing them from the mm's
+ * vma list and rb-tree as we go.  On return the detached vmas form a
+ * NULL-terminated chain beginning at 'vma', and the mm's list resumes
+ * at the first vma starting at or beyond 'end'.
+ */
+static void
+detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+	struct vm_area_struct *prev, unsigned long end)
+{
+	/* The link that currently points at the first vma to detach. */
+	struct vm_area_struct **link = prev ? &prev->vm_next : &mm->mmap;
+	struct vm_area_struct *last;
+
+	for (;;) {
+		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		mm->map_count--;
+		last = vma;
+		vma = vma->vm_next;
+		if (!vma || vma->vm_start >= end)
+			break;
+	}
+
+	*link = vma;			/* splice the survivors together */
+	last->vm_next = NULL;		/* terminate the detached chain */
+	mm->mmap_cache = NULL;		/* Kill the cache. */
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ *
+ * With new_below set, the new vma covers [vm_start, addr) and the
+ * existing vma is adjusted to start at addr; otherwise the new vma
+ * covers [addr, vm_end) and the existing vma is adjusted to end at addr.
+ *
+ * Returns 0 on success, -EINVAL if a hugetlb vma would be split at an
+ * address that is not hugepage aligned, -ENOMEM if the map count limit
+ * is reached or allocation fails, or the error from mpol_copy().
+ */
+int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+	      unsigned long addr, int new_below)
+{
+	struct mempolicy *pol;
+	struct vm_area_struct *new;
+
+	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+		return -EINVAL;
+
+	/* Splitting adds one vma, so refuse if already at the limit. */
+	if (mm->map_count >= sysctl_max_map_count)
+		return -ENOMEM;
+
+	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* most fields are the same, copy all, and then fixup */
+	*new = *vma;
+
+	if (new_below)
+		new->vm_end = addr;
+	else {
+		new->vm_start = addr;
+		/* The tail begins further in, so advance its page offset. */
+		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+	}
+
+	/* The struct copy shared the policy pointer; give 'new' its own. */
+	pol = mpol_copy(vma_policy(vma));
+	if (IS_ERR(pol)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return PTR_ERR(pol);
+	}
+	vma_set_policy(new, pol);
+
+	/* 'new' also refers to the file: take a reference for it. */
+	if (new->vm_file)
+		get_file(new->vm_file);
+
+	if (new->vm_ops && new->vm_ops->open)
+		new->vm_ops->open(new);
+
+	/* Shrink the original vma and pass 'new' for insertion. */
+	if (new_below)
+		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+			((addr - new->vm_start) >> PAGE_SHIFT), new);
+	else
+		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
+	return 0;
+}
+
+/* Munmap is split into 2 main parts -- this part which finds
+ * what needs doing, and the areas themselves, which do the
+ * work.  This now handles partial unmappings.
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ *
+ * Unmaps [start, start + len) from mm.  Returns 0 on success, which
+ * includes the case where nothing was mapped in the range; -EINVAL for
+ * an unaligned or out-of-range start, an oversized len, or a len that
+ * page-aligns to zero; or the error from split_vma().
+ * sys_munmap() in this file calls it with mm->mmap_sem held for write.
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+	unsigned long end;
+	struct vm_area_struct *mpnt, *prev, *last;
+
+	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+		return -EINVAL;
+
+	if ((len = PAGE_ALIGN(len)) == 0)
+		return -EINVAL;
+
+	/* Find the first overlapping VMA */
+	mpnt = find_vma_prev(mm, start, &prev);
+	if (!mpnt)
+		return 0;
+	/* we have  start < mpnt->vm_end  */
+
+	/* if it doesn't overlap, we have nothing.. */
+	end = start + len;
+	if (mpnt->vm_start >= end)
+		return 0;
+
+	/*
+	 * If we need to split any vma, do it now to save pain later.
+	 *
+	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
+	 * unmapped vm_area_struct will remain in use: so lower split_vma
+	 * places tmp vma above, and higher split_vma places tmp vma below.
+	 */
+	if (start > mpnt->vm_start) {
+		int error = split_vma(mm, mpnt, start, 0);
+		if (error)
+			return error;
+		/* mpnt now ends at start, so it precedes the doomed range. */
+		prev = mpnt;
+	}
+
+	/* Does it split the last one? */
+	last = find_vma(mm, end);
+	if (last && end > last->vm_start) {
+		int error = split_vma(mm, last, end, 1);
+		if (error)
+			return error;
+	}
+	/* Re-derive the first vma to remove: the splits changed the list. */
+	mpnt = prev? prev->vm_next: mm->mmap;
+
+	/*
+	 * Remove the vma's, and unmap the actual pages
+	 */
+	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
+	spin_lock(&mm->page_table_lock);
+	unmap_region(mm, mpnt, prev, start, end);
+	spin_unlock(&mm->page_table_lock);
+
+	/* Fix up all other VM information */
+	unmap_vma_list(mm, mpnt);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(do_munmap);
+
+/*
+ * munmap system call: run do_munmap() on the current task's mm with
+ * mmap_sem held for write, and return its result.
+ */
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
+{
+	struct mm_struct *mm = current->mm;
+	int rc;
+
+	profile_munmap(addr);
+
+	down_write(&mm->mmap_sem);
+	rc = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+
+	return rc;
+}
+
+/*
+ * Debug aid (CONFIG_DEBUG_KERNEL only): warn if mmap_sem can be taken
+ * for reading, which means the caller does not hold it for writing.
+ */
+static inline void verify_mm_writelocked(struct mm_struct *mm)
+{
+#ifdef CONFIG_DEBUG_KERNEL
+	/* The trylock is expected to fail while a writer holds the sem. */
+	if (!unlikely(down_read_trylock(&mm->mmap_sem)))
+		return;
+
+	WARN_ON(1);
+	up_read(&mm->mmap_sem);
+#endif
+}
+
+/*
+ *  this is really a simplified "do_mmap".  it only handles
+ *  anonymous maps.  eventually we may be able to do some
+ *  brk-specific accounting here.
+ *
+ *  Maps [addr, addr + PAGE_ALIGN(len)) in current->mm, first
+ *  unmapping anything already there.  Returns addr on success
+ *  (including when len page-aligns to zero), or a negative errno:
+ *  -EINVAL for a range that overflows or exceeds TASK_SIZE, -EAGAIN
+ *  when the mlock limit would be exceeded, -ENOMEM otherwise.
+ */
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+	struct mm_struct * mm = current->mm;
+	struct vm_area_struct * vma, * prev;
+	unsigned long flags;
+	struct rb_node ** rb_link, * rb_parent;
+	/* Anonymous mapping: page offset mirrors the virtual address. */
+	pgoff_t pgoff = addr >> PAGE_SHIFT;
+
+	len = PAGE_ALIGN(len);
+	if (!len)
+		return addr;
+
+	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
+		return -EINVAL;
+
+	/*
+	 * mlock MCL_FUTURE?
+	 */
+	if (mm->def_flags & VM_LOCKED) {
+		unsigned long locked, lock_limit;
+		locked = mm->locked_vm << PAGE_SHIFT;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += len;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+
+	/*
+	 * mm->mmap_sem is required to protect against another thread
+	 * changing the mappings in case we sleep.
+	 */
+	verify_mm_writelocked(mm);
+
+	/*
+	 * Clear old maps.  this also does some error checking for us
+	 */
+ munmap_back:
+	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (vma && vma->vm_start < addr + len) {
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+		/* Look up again: the unmap changed the vma tree. */
+		goto munmap_back;
+	}
+
+	/* Check against address space limits *after* clearing old maps... */
+	if ((mm->total_vm << PAGE_SHIFT) + len
+	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
+		return -ENOMEM;
+
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	/* From here on a failure must undo this charge. */
+	if (security_vm_enough_memory(len >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+	/* Can we just expand an old private anonymous mapping? */
+	if (vma_merge(mm, prev, addr, addr + len, flags,
+					NULL, NULL, pgoff, NULL))
+		goto out;
+
+	/*
+	 * create a vma struct for an anonymous mapping
+	 */
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		vm_unacct_memory(len >> PAGE_SHIFT);
+		return -ENOMEM;
+	}
+	memset(vma, 0, sizeof(*vma));
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_pgoff = pgoff;
+	vma->vm_flags = flags;
+	/* Page protection is looked up from the low four flag bits. */
+	vma->vm_page_prot = protection_map[flags & 0x0f];
+	vma_link(mm, vma, prev, rb_link, rb_parent);
+out:
+	/* Reached for both the merged and the newly linked vma. */
+	mm->total_vm += len >> PAGE_SHIFT;
+	if (flags & VM_LOCKED) {
+		mm->locked_vm += len >> PAGE_SHIFT;
+		make_pages_present(addr, addr + len);
+	}
+	return addr;
+}
+
+EXPORT_SYMBOL(do_brk);
+
+/*
+ * Release all mmaps.
+ *
+ * Under the page table lock: unmap every vma, clear the user page
+ * tables and reset the mm's vma list, rb-tree, cache and counters.
+ * After dropping the lock, walk the saved vma list and release each
+ * vma with remove_vm_struct().
+ */
+void exit_mmap(struct mm_struct *mm)
+{
+	struct mmu_gather *tlb;
+	struct vm_area_struct *vma;
+	unsigned long nr_accounted = 0;
+
+	lru_add_drain();
+
+	spin_lock(&mm->page_table_lock);
+
+	/* NOTE(review): second argument 1 presumably marks a full-mm flush - confirm */
+	tlb = tlb_gather_mmu(mm, 1);
+	flush_cache_mm(mm);
+	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
+	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
+					~0UL, &nr_accounted, NULL);
+	vm_unacct_memory(nr_accounted);
+	BUG_ON(mm->map_count);	/* This is just debugging */
+	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
+	
+	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
+
+	/* Keep the list head for the walk below, then empty the mm. */
+	vma = mm->mmap;
+	mm->mmap = mm->mmap_cache = NULL;
+	mm->mm_rb = RB_ROOT;
+	set_mm_counter(mm, rss, 0);
+	mm->total_vm = 0;
+	mm->locked_vm = 0;
+
+	spin_unlock(&mm->page_table_lock);
+
+	/*
+	 * Walk the list again, actually closing and freeing it
+	 * without holding any MM locks.
+	 */
+	while (vma) {
+		/* Read the link first; vma is handed to remove_vm_struct(). */
+		struct vm_area_struct *next = vma->vm_next;
+		remove_vm_struct(vma);
+		vma = next;
+	}
+}
+
+/*
+ * Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree.  If vm_file is non-NULL
+ * then i_mmap_lock is taken here.
+ *
+ * Returns 0 on success, or -ENOMEM if the vma would overlap an
+ * existing mapping.
+ */
+int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+{
+	struct vm_area_struct *next, *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	/*
+	 * The vm_pgoff of a purely anonymous vma should be irrelevant
+	 * until its first write fault, when page's anon_vma and index
+	 * are set.  But now set the vm_pgoff it will almost certainly
+	 * end up with (unless mremap moves it elsewhere before that
+	 * first wfault), so /proc/pid/maps tells a consistent story.
+	 *
+	 * By setting it to reflect the virtual start address of the
+	 * vma, merges and splits can happen in a seamless way, just
+	 * using the existing file pgoff checks and manipulations.
+	 * Similarly in do_mmap_pgoff and in do_brk.
+	 */
+	if (!vma->vm_file) {
+		BUG_ON(vma->anon_vma);
+		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+	}
+
+	next = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
+	if (!next || next->vm_start >= vma->vm_end) {
+		/* The slot is free: link the vma in. */
+		vma_link(mm, vma, prev, rb_link, rb_parent);
+		return 0;
+	}
+
+	/* The following vma starts inside the new one: refuse. */
+	return -ENOMEM;
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ *
+ * Returns the vma now covering [addr, addr + len): either an existing
+ * neighbour it was merged into, or a freshly allocated copy.  Returns
+ * NULL on allocation failure.  *vmap is updated when the source vma
+ * itself ended up merged into the returned vma.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+	unsigned long addr, unsigned long len, pgoff_t pgoff)
+{
+	struct vm_area_struct *vma = *vmap;
+	unsigned long vma_start = vma->vm_start;
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
+	struct mempolicy *pol;
+
+	/*
+	 * If anonymous vma has not yet been faulted, update new pgoff
+	 * to match new location, to increase its chance of merging.
+	 */
+	if (!vma->vm_file && !vma->anon_vma)
+		pgoff = addr >> PAGE_SHIFT;
+
+	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+	if (new_vma) {
+		/*
+		 * Source vma may have been merged into new_vma
+		 */
+		if (vma_start >= new_vma->vm_start &&
+		    vma_start < new_vma->vm_end)
+			*vmap = new_vma;
+		return new_vma;
+	}
+
+	/* No merge was possible: build a separate vma from a copy. */
+	new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!new_vma)
+		return NULL;
+
+	*new_vma = *vma;
+	pol = mpol_copy(vma_policy(vma));
+	if (IS_ERR(pol)) {
+		kmem_cache_free(vm_area_cachep, new_vma);
+		return NULL;
+	}
+	vma_set_policy(new_vma, pol);
+
+	new_vma->vm_start = addr;
+	new_vma->vm_end = addr + len;
+	new_vma->vm_pgoff = pgoff;
+
+	/* The copy refers to the same file: take its own reference. */
+	if (new_vma->vm_file)
+		get_file(new_vma->vm_file);
+	if (new_vma->vm_ops && new_vma->vm_ops->open)
+		new_vma->vm_ops->open(new_vma);
+
+	vma_link(mm, new_vma, prev, rb_link, rb_parent);
+	return new_vma;
+}
diff --git a/mm/mprotect.c b/mm/mprotect.c
new file mode 100644
index 000000000000..e9fbd013ad9a
--- /dev/null
+++ b/mm/mprotect.c
@@ -0,0 +1,282 @@
+/*
+ *  mm/mprotect.c
+ *
+ *  (C) Copyright 1994 Linus Torvalds
+ *  (C) Copyright 2002 Christoph Hellwig
+ *
+ *  Address space accounting code	<alan@redhat.com>
+ *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/mempolicy.h>
+#include <linux/personality.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Apply newprot to every present pte covering [addr, end) under the
+ * given pmd.  Entries that are not present are left untouched.
+ * The loop runs at least once, so addr must be below end on entry.
+ */
+static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pte_t *pte;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		if (pte_present(*pte)) {
+			pte_t ptent;
+
+			/* Avoid an SMP race with hardware updated dirty/clean
+			 * bits by wiping the pte and then setting the new pte
+			 * into place.
+			 */
+			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
+			set_pte_at(mm, addr, pte, ptent);
+			lazy_mmu_prot_update(ptent);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	/* pte was advanced past the last entry visited; unmap that one. */
+	pte_unmap(pte - 1);
+}
+
+/*
+ * Walk the pmd entries covering [addr, end) under the given pud and
+ * apply newprot to the ptes below each one that is populated.
+ */
+static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		/* End of the part of the range handled via this pmd entry. */
+		next = pmd_addr_end(addr, end);
+		/* 'continue' still runs the stepping in the while clause. */
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		change_pte_range(mm, pmd, addr, next, newprot);
+	} while (pmd++, addr = next, addr != end);
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under the given pgd and
+ * descend into each one that is populated.
+ */
+static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		/* End of the part of the range handled via this pud entry. */
+		next = pud_addr_end(addr, end);
+		/* 'continue' still runs the stepping in the while clause. */
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		change_pmd_range(mm, pud, addr, next, newprot);
+	} while (pud++, addr = next, addr != end);
+}
+
+/*
+ * Rewrite the page table entries of [addr, end) in vma's mm with
+ * newprot.  The cache is flushed for the range before the walk; the
+ * walk and the TLB flush happen under the mm's page_table_lock.
+ * An empty or inverted range is a bug.
+ */
+static void change_protection(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	unsigned long next;
+	/* addr is advanced by the walk; keep the start for the TLB flush. */
+	unsigned long start = addr;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		/* 'continue' still runs the stepping in the while clause. */
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		change_pud_range(mm, pgd, addr, next, newprot);
+	} while (pgd++, addr = next, addr != end);
+	flush_tlb_range(vma, start, end);
+	spin_unlock(&mm->page_table_lock);
+}
+
+/*
+ * Give [start, end) of vma the flags newflags, merging with or
+ * splitting off from neighbouring vmas as needed, and update the page
+ * tables to match.
+ *
+ * *pprev is the vma preceding the range on entry; on return it is the
+ * vma that now carries the range.  Returns 0 on success, -ENOMEM if
+ * the extra commit charge is refused, or the error from split_vma().
+ */
+static int
+mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+	unsigned long start, unsigned long end, unsigned long newflags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long oldflags = vma->vm_flags;
+	long nrpages = (end - start) >> PAGE_SHIFT;
+	unsigned long charged = 0;
+	pgprot_t newprot;
+	pgoff_t pgoff;
+	int error;
+
+	/* Nothing to change. */
+	if (newflags == oldflags) {
+		*pprev = vma;
+		return 0;
+	}
+
+	/*
+	 * If we make a private mapping writable we increase our commit;
+	 * but (without finer accounting) cannot reduce our commit if we
+	 * make it unwritable again.
+	 *
+	 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
+	 * a MAP_NORESERVE private mapping to writable will now reserve.
+	 */
+	if (newflags & VM_WRITE) {
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+			charged = nrpages;
+			if (security_vm_enough_memory(charged))
+				return -ENOMEM;
+			newflags |= VM_ACCOUNT;
+		}
+	}
+
+	/* Page protection is looked up from the low four flag bits. */
+	newprot = protection_map[newflags & 0xf];
+
+	/*
+	 * First try to merge with previous and/or next vma.
+	 */
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*pprev = vma_merge(mm, *pprev, start, end, newflags,
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+	if (*pprev) {
+		vma = *pprev;
+		goto success;
+	}
+
+	*pprev = vma;
+
+	/* No merge: cut the range out of vma so it can be changed alone. */
+	if (start != vma->vm_start) {
+		error = split_vma(mm, vma, start, 1);
+		if (error)
+			goto fail;
+	}
+
+	if (end != vma->vm_end) {
+		error = split_vma(mm, vma, end, 0);
+		if (error)
+			goto fail;
+	}
+
+success:
+	/*
+	 * vm_flags and vm_page_prot are protected by the mmap_sem
+	 * held in write mode.
+	 */
+	vma->vm_flags = newflags;
+	vma->vm_page_prot = newprot;
+	change_protection(vma, start, end, newprot);
+	/* Move the pages from the old flags' statistics to the new ones. */
+	__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+	__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	return 0;
+
+fail:
+	/* Give back the commit charge taken above (zero if none). */
+	vm_unacct_memory(charged);
+	return error;
+}
+
+/*
+ * mprotect system call: change the protection of [start, start + len)
+ * in the current mm to prot.
+ *
+ * PROT_GROWSDOWN / PROT_GROWSUP (not both) extend the range to the
+ * start / end of the vma found at 'start', which must then be a
+ * VM_GROWSDOWN / VM_GROWSUP vma.
+ *
+ * Returns 0 on success; -EINVAL for bad arguments; -ENOMEM if the
+ * range wraps or is not fully covered by mappings; -EACCES for a
+ * hugetlb vma or a protection the vma does not allow; or the error
+ * from security_file_mprotect() or mprotect_fixup().
+ */
+asmlinkage long
+sys_mprotect(unsigned long start, size_t len, unsigned long prot)
+{
+	unsigned long vm_flags, nstart, end, tmp, reqprot;
+	struct vm_area_struct *vma, *prev;
+	int error = -EINVAL;
+	/* Split the grow modifiers off from the protection bits proper. */
+	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
+	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
+		return -EINVAL;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	if (!len)
+		return 0;
+	len = PAGE_ALIGN(len);
+	end = start + len;
+	/* Catches wrap-around of start + len. */
+	if (end <= start)
+		return -ENOMEM;
+	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
+		return -EINVAL;
+
+	/* The protection as requested, before READ_IMPLIES_EXEC is applied. */
+	reqprot = prot;
+	/*
+	 * Does the application expect PROT_READ to imply PROT_EXEC:
+	 */
+	if (unlikely((prot & PROT_READ) &&
+			(current->personality & READ_IMPLIES_EXEC)))
+		prot |= PROT_EXEC;
+
+	vm_flags = calc_vm_prot_bits(prot);
+
+	down_write(&current->mm->mmap_sem);
+
+	vma = find_vma_prev(current->mm, start, &prev);
+	error = -ENOMEM;
+	if (!vma)
+		goto out;
+	if (unlikely(grows & PROT_GROWSDOWN)) {
+		/* The vma found must overlap the range ... */
+		if (vma->vm_start >= end)
+			goto out;
+		/* ... and the range is then extended down to its start. */
+		start = vma->vm_start;
+		error = -EINVAL;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto out;
+	}
+	else {
+		/* start must lie inside the vma found. */
+		if (vma->vm_start > start)
+			goto out;
+		if (unlikely(grows & PROT_GROWSUP)) {
+			end = vma->vm_end;
+			error = -EINVAL;
+			if (!(vma->vm_flags & VM_GROWSUP))
+				goto out;
+		}
+	}
+	/* Range begins inside vma, so vma is its own predecessor for merging. */
+	if (start > vma->vm_start)
+		prev = vma;
+
+	for (nstart = start ; ; ) {
+		unsigned long newflags;
+
+		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
+
+		if (is_vm_hugetlb_page(vma)) {
+			error = -EACCES;
+			goto out;
+		}
+
+		/* Replace only the read/write/exec bits of the vma's flags. */
+		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+
+		/*
+		 * Refuse any low-nibble bit that is set without the bit four
+		 * positions above it also being set.
+		 * NOTE(review): presumably the upper nibble holds the
+		 * matching VM_MAY* permission bits - confirm against the
+		 * flag definitions.
+		 */
+		if ((newflags & ~(newflags >> 4)) & 0xf) {
+			error = -EACCES;
+			goto out;
+		}
+
+		error = security_file_mprotect(vma, reqprot, prot);
+		if (error)
+			goto out;
+
+		/* Handle at most up to the end of this vma in one step. */
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+		if (error)
+			goto out;
+		nstart = tmp;
+
+		/* A merge may have made prev extend beyond tmp. */
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		/* Done: error is 0 from the successful fixup above. */
+		if (nstart >= end)
+			goto out;
+
+		/* The next vma must continue exactly where this one ended. */
+		vma = prev->vm_next;
+		if (!vma || vma->vm_start != nstart) {
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
diff --git a/mm/mremap.c b/mm/mremap.c
new file mode 100644
index 000000000000..0d1c1b9c7a0a
--- /dev/null
+++ b/mm/mremap.c
@@ -0,0 +1,426 @@
+/*
+ *	mm/mremap.c
+ *
+ *	(C) Copyright 1996 Linus Torvalds
+ *
+ *	Address space accounting code	<alan@redhat.com>
+ *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Look up the pte for @addr in @mm and map it with pte_offset_map_nested().
+ *
+ * Returns NULL when any upper level fails its *_none_or_clear_bad() check
+ * or when the pte itself is pte_none(); in the latter case the mapping is
+ * dropped again before returning.  A non-NULL result must be released by
+ * the caller with pte_unmap_nested().
+ */
+static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+
+	if (pgd_none_or_clear_bad(pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none_or_clear_bad(pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none_or_clear_bad(pmd))
+		return NULL;
+
+	ptep = pte_offset_map_nested(pmd, addr);
+	if (!pte_none(*ptep))
+		return ptep;
+
+	/* empty slot: there is nothing to hand back, so unmap it again */
+	pte_unmap_nested(ptep);
+	return NULL;
+}
+
+/*
+ * Map the pte slot for @addr in @mm with pte_offset_map(), without
+ * allocating anything.
+ *
+ * Returns NULL when the pgd, pud or pmd level fails its
+ * *_none_or_clear_bad() check.  Unlike get_one_pte_map_nested() the pte
+ * contents are not inspected, so an empty slot is still returned.  A
+ * non-NULL result must be released with pte_unmap().
+ */
+static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+{
+	pte_t *ptep = NULL;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_none_or_clear_bad(pgd)) {
+		pud = pud_offset(pgd, addr);
+		if (!pud_none_or_clear_bad(pud)) {
+			pmd = pmd_offset(pud, addr);
+			if (!pmd_none_or_clear_bad(pmd))
+				ptep = pte_offset_map(pmd, addr);
+		}
+	}
+
+	return ptep;
+}
+
+/*
+ * Like get_one_pte_map(), but build the pud/pmd/pte levels for @addr with
+ * pud_alloc(), pmd_alloc() and pte_alloc_map() instead of giving up when
+ * they are absent.
+ *
+ * Returns the mapped pte slot, or NULL if one of the allocators returned
+ * NULL.  A non-NULL result must be released with pte_unmap().
+ */
+static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return NULL;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return NULL;
+
+	return pte_alloc_map(mm, pmd, addr);
+}
+
+/*
+ * Move the single pte that maps @old_addr in @vma so that it maps
+ * @new_addr (covered by @new_vma) in the same mm.
+ *
+ * Locking: for a file-backed @vma the file mapping's i_mmap_lock is taken
+ * first, then mm->page_table_lock; both are released before returning.
+ *
+ * Returns 0 on success, and also when @old_addr has no pte to move.
+ * Returns -ENOMEM when a source pte exists but no destination pte slot
+ * could be obtained.
+ */
+static int
+move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
+		struct vm_area_struct *new_vma, unsigned long new_addr)
+{
+	struct address_space *mapping = NULL;
+	struct mm_struct *mm = vma->vm_mm;
+	int error = 0;
+	pte_t *src, *dst;
+
+	if (vma->vm_file) {
+		/*
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, we must lock vmtruncate out,
+		 * since it might clean the dst vma before the src vma,
+		 * and we propagate stale pages into the dst afterward.
+		 */
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		/*
+		 * NOTE(review): resetting vm_truncate_count to 0 presumably
+		 * makes a concurrent vmtruncate revisit new_vma - confirm
+		 * against the truncate code.
+		 */
+		if (new_vma->vm_truncate_count &&
+		    new_vma->vm_truncate_count != vma->vm_truncate_count)
+			new_vma->vm_truncate_count = 0;
+	}
+	spin_lock(&mm->page_table_lock);
+
+	src = get_one_pte_map_nested(mm, old_addr);
+	if (src) {
+		/*
+		 * Look to see whether alloc_one_pte_map needs to perform a
+		 * memory allocation.  If it does then we need to drop the
+		 * atomic kmap
+		 */
+		dst = get_one_pte_map(mm, new_addr);
+		if (unlikely(!dst)) {
+			/*
+			 * Slow path: release the src mapping and i_mmap_lock
+			 * around the allocation.  Afterwards i_mmap_lock is
+			 * retaken; if the trylock fails, page_table_lock is
+			 * dropped first so that both locks are re-acquired
+			 * in the same order as at the top of the function.
+			 */
+			pte_unmap_nested(src);
+			if (mapping)
+				spin_unlock(&mapping->i_mmap_lock);
+			dst = alloc_one_pte_map(mm, new_addr);
+			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
+				spin_unlock(&mm->page_table_lock);
+				spin_lock(&mapping->i_mmap_lock);
+				spin_lock(&mm->page_table_lock);
+			}
+			src = get_one_pte_map_nested(mm, old_addr);
+		}
+		/*
+		 * Since alloc_one_pte_map can drop and re-acquire
+		 * page_table_lock, we should re-check the src entry...
+		 */
+		if (src) {
+			if (dst) {
+				pte_t pte;
+				/* clear (and flush) the old entry, then install it at the new address */
+				pte = ptep_clear_flush(vma, old_addr, src);
+				set_pte_at(mm, new_addr, dst, pte);
+			} else
+				error = -ENOMEM;
+			pte_unmap_nested(src);
+		}
+		if (dst)
+			pte_unmap(dst);
+	}
+	spin_unlock(&mm->page_table_lock);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+	return error;
+}
+
+/*
+ * Move the ptes covering [old_addr, old_addr + len) in @vma to
+ * [new_addr, new_addr + len) in @new_vma, one page at a time.
+ *
+ * Returns the number of bytes moved: @len when everything was moved, or
+ * the offset of the first page for which move_one_page() failed.
+ */
+static unsigned long move_page_tables(struct vm_area_struct *vma,
+		unsigned long old_addr, struct vm_area_struct *new_vma,
+		unsigned long new_addr, unsigned long len)
+{
+	unsigned long done = 0;
+
+	flush_cache_range(vma, old_addr, old_addr + len);
+
+	/*
+	 * Deliberately simple page-by-page loop: remappings are assumed to
+	 * be small, and stopping at a page boundary keeps the caller's
+	 * error recovery straightforward.
+	 */
+	while (done < len) {
+		int err = move_one_page(vma, old_addr + done,
+					new_vma, new_addr + done);
+
+		if (err < 0)
+			break;
+		cond_resched();
+		done += PAGE_SIZE;
+	}
+
+	return done;
+}
+
+/*
+ * Relocate the @old_len bytes at @old_addr inside @vma to a freshly copied
+ * vma of @new_len bytes at @new_addr, then unmap the old range.
+ *
+ * Returns @new_addr on success.  Returns -ENOMEM (as an unsigned long)
+ * when the map count is too close to sysctl_max_map_count, when copy_vma()
+ * fails, or when the page tables could only be partially moved; in the
+ * last case the entries are moved back and the new area is unmapped
+ * instead of the old one.
+ */
+static unsigned long move_vma(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long old_len,
+		unsigned long new_len, unsigned long new_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
+	int split = 0;
+
+	/*
+	 * We'd prefer to avoid failure later on in do_munmap:
+	 * which may split one vma into three before unmapping.
+	 */
+	if (mm->map_count >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
+	/* file offset (in pages) that corresponds to old_addr */
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	/* copy_vma() takes &vma and may therefore update our vma pointer */
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
+
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
+	}
+
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		/* bytes of vma that will remain after the munmap below */
+		excess = vma->vm_end - vma->vm_start - old_len;
+		/* range lies strictly inside vma: unmapping leaves two pieces */
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
+	}
+
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}
+
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
+	}
+
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		/*
+		 * Only the grown tail needs faulting in.  On the failure
+		 * path old_len was set equal to new_len above, so this is
+		 * skipped there.
+		 */
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
+	}
+
+	return new_addr;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ *
+ * @addr:     page-aligned start of the existing mapping
+ * @old_len:  current length (rounded up to a page multiple; 0 is allowed)
+ * @new_len:  requested length (rounded up to a page multiple; 0 is refused)
+ * @flags:    any combination of MREMAP_MAYMOVE and MREMAP_FIXED
+ * @new_addr: target address, only looked at when MREMAP_FIXED is set
+ *
+ * Returns the (page-aligned) address of the resulting mapping, or a
+ * negative errno value cast to unsigned long.  sys_mremap() calls this
+ * with current->mm->mmap_sem held for writing.
+ */
+unsigned long do_mremap(unsigned long addr,
+	unsigned long old_len, unsigned long new_len,
+	unsigned long flags, unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long ret = -EINVAL;
+	unsigned long charged = 0;
+
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+		goto out;
+
+	if (addr & ~PAGE_MASK)
+		goto out;
+
+	old_len = PAGE_ALIGN(old_len);
+	new_len = PAGE_ALIGN(new_len);
+
+	/*
+	 * We allow a zero old-len as a special case
+	 * for DOS-emu "duplicate shm area" thing. But
+	 * a zero new-len is nonsensical.
+	 */
+	if (!new_len)
+		goto out;
+
+	/* new_addr is only valid if MREMAP_FIXED is specified */
+	if (flags & MREMAP_FIXED) {
+		if (new_addr & ~PAGE_MASK)
+			goto out;
+		if (!(flags & MREMAP_MAYMOVE))
+			goto out;
+
+		if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+			goto out;
+
+		/* Check if the location we're moving into overlaps the
+		 * old location at all, and fail if it does.
+		 */
+		if ((new_addr <= addr) && (new_addr+new_len) > addr)
+			goto out;
+
+		if ((addr <= new_addr) && (addr+old_len) > new_addr)
+			goto out;
+
+		/* clear out whatever currently occupies the target range */
+		ret = do_munmap(current->mm, new_addr, new_len);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * Always allow a shrinking remap: that just unmaps
+	 * the unnecessary pages..
+	 * do_munmap does all the needed commit accounting
+	 */
+	if (old_len >= new_len) {
+		ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+		/* a do_munmap() error is only ignored for the equal-length case */
+		if (ret && old_len != new_len)
+			goto out;
+		ret = addr;
+		/* done unless a move to a different fixed address was also asked for */
+		if (!(flags & MREMAP_FIXED) || (new_addr == addr))
+			goto out;
+		old_len = new_len;
+	}
+
+	/*
+	 * Ok, we need to grow..  or relocate.
+	 */
+	ret = -EFAULT;
+	vma = find_vma(current->mm, addr);
+	if (!vma || vma->vm_start > addr)
+		goto out;
+	if (is_vm_hugetlb_page(vma)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* We can't remap across vm area boundaries */
+	if (old_len > vma->vm_end - addr)
+		goto out;
+	if (vma->vm_flags & VM_DONTEXPAND) {
+		if (new_len > old_len)
+			goto out;
+	}
+	if (vma->vm_flags & VM_LOCKED) {
+		/* growing a locked mapping must respect RLIMIT_MEMLOCK */
+		unsigned long locked, lock_limit;
+		locked = current->mm->locked_vm << PAGE_SHIFT;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += new_len - old_len;
+		ret = -EAGAIN;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			goto out;
+	}
+	ret = -ENOMEM;
+	/* the grown address space must stay within RLIMIT_AS */
+	if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
+	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
+		goto out;
+
+	if (vma->vm_flags & VM_ACCOUNT) {
+		charged = (new_len - old_len) >> PAGE_SHIFT;
+		/* charge refused: nothing to give back, so bypass the out: unaccounting */
+		if (security_vm_enough_memory(charged))
+			goto out_nc;
+	}
+
+	/* old_len exactly to the end of the area..
+	 * And we're not relocating the area.
+	 */
+	if (old_len == vma->vm_end - addr &&
+	    !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
+	    (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
+		/* room available up to the next vma, or TASK_SIZE if there is none */
+		unsigned long max_addr = TASK_SIZE;
+		if (vma->vm_next)
+			max_addr = vma->vm_next->vm_start;
+		/* can we just expand the current mapping? */
+		if (max_addr - addr >= new_len) {
+			int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+			vma_adjust(vma, vma->vm_start,
+				addr + new_len, vma->vm_pgoff, NULL);
+
+			current->mm->total_vm += pages;
+			__vm_stat_account(vma->vm_mm, vma->vm_flags,
+							vma->vm_file, pages);
+			if (vma->vm_flags & VM_LOCKED) {
+				current->mm->locked_vm += pages;
+				make_pages_present(addr + old_len,
+						   addr + new_len);
+			}
+			ret = addr;
+			goto out;
+		}
+	}
+
+	/*
+	 * We weren't able to just expand or shrink the area,
+	 * we need to create a new one and move it..
+	 */
+	ret = -ENOMEM;
+	if (flags & MREMAP_MAYMOVE) {
+		if (!(flags & MREMAP_FIXED)) {
+			unsigned long map_flags = 0;
+			if (vma->vm_flags & VM_MAYSHARE)
+				map_flags |= MAP_SHARED;
+
+			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+						vma->vm_pgoff, map_flags);
+			ret = new_addr;
+			/* a result that is not page aligned is treated as an error code */
+			if (new_addr & ~PAGE_MASK)
+				goto out;
+		}
+		ret = move_vma(vma, addr, old_len, new_len, new_addr);
+	}
+out:
+	/* ret is an error code rather than a page-aligned address: undo the charge */
+	if (ret & ~PAGE_MASK)
+		vm_unacct_memory(charged);
+out_nc:
+	return ret;
+}
+
+/*
+ * mremap() system call entry point: run do_mremap() with the calling
+ * process's mmap_sem held for writing and pass its result straight back.
+ */
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+	unsigned long old_len, unsigned long new_len,
+	unsigned long flags, unsigned long new_addr)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long result;
+
+	down_write(&mm->mmap_sem);
+	result = do_mremap(addr, old_len, new_len, flags, new_addr);
+	up_write(&mm->mmap_sem);
+
+	return result;
+}
diff --git a/mm/msync.c b/mm/msync.c
new file mode 100644
index 000000000000..090f426bca7d
--- /dev/null
+++ b/mm/msync.c
@@ -0,0 +1,236 @@
+/*
+ *	linux/mm/msync.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * The msync() system call.
+ */
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Transfer the dirty state of one pte to its struct page.
+ *
+ * Entries that are not present, whose pfn is not valid, or whose page is
+ * reserved are left alone.  Otherwise the page is marked dirty if either
+ * the pte dirty bit (cleared and flushed here) or the page's own
+ * test-and-clear dirty state was set.
+ */
+static inline void sync_one_pte(struct vm_area_struct *vma,
+				unsigned long addr, pte_t *pte)
+{
+	unsigned long pfn;
+	struct page *page;
+
+	if (!pte_present(*pte))
+		return;
+
+	pfn = pte_pfn(*pte);
+	if (!pfn_valid(pfn))
+		return;
+
+	page = pfn_to_page(pfn);
+	if (PageReserved(page))
+		return;
+
+	if (ptep_clear_flush_dirty(vma, addr, pte) ||
+	    page_test_and_clear_dirty(page))
+		set_page_dirty(page);
+}
+
+/*
+ * Walk the ptes under @pmd for [addr, end) and sync each one.
+ *
+ * Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
+static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end)
+{
+	pte_t *pte = pte_offset_map(pmd, addr);
+
+	do {
+		sync_one_pte(vma, addr, pte);
+		pte++;
+		addr += PAGE_SIZE;
+	} while (addr != end);
+
+	/* pte has stepped one past the last entry visited */
+	pte_unmap(pte - 1);
+}
+
+/*
+ * Walk the pmd entries under @pud for [addr, end) and descend into every
+ * one that passes pmd_none_or_clear_bad().
+ */
+static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	unsigned long next;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_none_or_clear_bad(pmd))
+			sync_pte_range(vma, pmd, addr, next);
+		pmd++;
+		addr = next;
+	} while (addr != end);
+}
+
+/*
+ * Walk the pud entries under @pgd for [addr, end) and descend into every
+ * one that passes pud_none_or_clear_bad().
+ */
+static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	unsigned long next;
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (!pud_none_or_clear_bad(pud))
+			sync_pmd_range(vma, pud, addr, next);
+		pud++;
+		addr = next;
+	} while (addr != end);
+}
+
+/*
+ * Propagate pte dirty bits to struct page for [addr, end) of @vma.
+ *
+ * The caller must pass addr < end (enforced with BUG_ON).  The page table
+ * walk runs under mm->page_table_lock, after the cache for the range has
+ * been flushed.
+ */
+static void sync_page_range(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long next;
+	pgd_t *pgd;
+
+	/*
+	 * Hugepage vmas cannot be walked like ordinary page tables; that
+	 * is fine, hugetlbfs is memory based so msync() has nothing more
+	 * to do for them.
+	 */
+	if (is_vm_hugetlb_page(vma))
+		return;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+
+	spin_lock(&mm->page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none_or_clear_bad(pgd))
+			sync_pud_range(vma, pgd, addr, next);
+		pgd++;
+		addr = next;
+	} while (addr != end);
+	spin_unlock(&mm->page_table_lock);
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * Sync [addr, end) of @vma in chunks of at most 64KB, calling
+ * cond_resched() after each chunk so that page_table_lock is not held
+ * across the whole range in one go.
+ */
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	const size_t step = 64 * 1024;	/* bytes */
+	unsigned long stop;
+
+	for (;;) {
+		stop = addr + step;
+		/* clamp to end, also when addr + step wrapped around */
+		if (stop > end || stop < addr)
+			stop = end;
+		sync_page_range(vma, addr, stop);
+		cond_resched();
+
+		addr = stop;
+		if (addr == end)
+			break;
+	}
+}
+#else
+/* Without CONFIG_PREEMPT the whole range is synced in a single pass. */
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	sync_page_range(vma, addr, end);
+}
+#endif
+
+/*
+ * Sync the part [addr, end) of a single vma.
+ *
+ * MS_SYNC syncs the entire file - including mappings.
+ *
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
+ * marks the relevant pages dirty.  The application may now run fsync() to
+ * write out the dirty pages and wait on the writeout and check the result.
+ * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
+ * async writeout immediately.
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * applications.
+ *
+ * Returns -EBUSY for MS_INVALIDATE on a VM_LOCKED vma, otherwise 0 or the
+ * first error reported by the writeback/fsync/wait sequence.  Vmas that
+ * are not shared file mappings are left untouched and yield 0.
+ */
+static int msync_interval(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end, int flags)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping;
+	int ret, err;
+
+	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
+		return -EBUSY;
+
+	/* only shared file mappings have anything to write back */
+	if (!file || !(vma->vm_flags & VM_SHARED))
+		return 0;
+
+	filemap_sync(vma, addr, end);
+
+	if (!(flags & MS_SYNC))
+		return 0;
+
+	mapping = file->f_mapping;
+	ret = filemap_fdatawrite(mapping);
+	if (file->f_op && file->f_op->fsync) {
+		/*
+		 * We don't take i_sem here because mmap_sem
+		 * is already held.
+		 */
+		err = file->f_op->fsync(file,file->f_dentry,1);
+		if (err && !ret)
+			ret = err;
+	}
+	err = filemap_fdatawait(mapping);
+	if (!ret)
+		ret = err;
+
+	return ret;
+}
+
+/*
+ * msync() system call: walk the vmas overlapping [start, start + len) and
+ * pass each mapped piece to msync_interval().
+ *
+ * @start: page-aligned start address, otherwise -EINVAL
+ * @len:   length in bytes, rounded up to a page multiple; zero returns 0
+ * @flags: MS_ASYNC, MS_SYNC and/or MS_INVALIDATE; any other bit, or
+ *         MS_ASYNC together with MS_SYNC, gives -EINVAL
+ *
+ * Returns 0 on success, -ENOMEM when start + len wraps or when part of
+ * the range is not mapped, or the first error from msync_interval().
+ *
+ * mmap_sem is held for reading throughout.  PF_SYNCWRITE is set on
+ * current for MS_SYNC calls and always cleared on the way out.
+ */
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+	unsigned long end;
+	struct vm_area_struct *vma;
+	int unmapped_error, error = -EINVAL;
+
+	if (flags & MS_SYNC)
+		current->flags |= PF_SYNCWRITE;
+
+	down_read(&current->mm->mmap_sem);
+	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+		goto out;
+	if (start & ~PAGE_MASK)
+		goto out;
+	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
+		goto out;
+	error = -ENOMEM;
+	/* round len up to a whole number of pages */
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	/* start + len wrapped around the address space */
+	if (end < start)
+		goto out;
+	error = 0;
+	if (end == start)
+		goto out;
+	/*
+	 * If the interval [start,end) covers some unmapped address ranges,
+	 * just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	unmapped_error = 0;
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			/* hole before this vma: remember it and skip ahead */
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			/* last vma of the walk */
+			if (start < end) {
+				error = msync_interval(vma, start, end, flags);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = msync_interval(vma, start, vma->vm_end, flags);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+out:
+	up_read(&current->mm->mmap_sem);
+	current->flags &= ~PF_SYNCWRITE;
+	return error;
+}
diff --git a/mm/nommu.c b/mm/nommu.c
new file mode 100644
index 000000000000..b293ec1cc4e6
--- /dev/null
+++ b/mm/nommu.c
@@ -0,0 +1,1180 @@
+/*
+ *  linux/mm/nommu.c
+ *
+ *  Replacement code for mm functions to support CPU's that don't
+ *  have any form of memory management unit (thus no virtual memory).
+ *
+ *  See Documentation/nommu-mmap.txt
+ *
+ *  Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
+ *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
+ *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
+ *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ptrace.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Global mm state defined by the no-MMU build.
+ * NOTE(review): these presumably stand in for the definitions the MMU
+ * build gets from other mm files - confirm against mm/Makefile.
+ */
+void *high_memory;
+struct page *mem_map;
+unsigned long max_mapnr;
+unsigned long num_physpages;
+unsigned long askedalloc, realalloc;
+atomic_t vm_committed_space = ATOMIC_INIT(0);
+int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int heap_stack_gap = 0;
+
+EXPORT_SYMBOL(mem_map);
+EXPORT_SYMBOL(sysctl_max_map_count);
+EXPORT_SYMBOL(sysctl_overcommit_memory);
+EXPORT_SYMBOL(sysctl_overcommit_ratio);
+EXPORT_SYMBOL(vm_committed_space);
+EXPORT_SYMBOL(__vm_enough_memory);
+
+/*
+ * rb-tree of VMAs ordered by vm_start, maintained by add_nommu_vma() and
+ * delete_nommu_vma().
+ * NOTE(review): nommu_vma_sem presumably serialises access to the tree -
+ * no user of it is visible in this part of the file.
+ */
+struct rb_root nommu_vma_tree = RB_ROOT;
+DECLARE_RWSEM(nommu_vma_sem);
+
+/* left empty: no vm operations are supplied here */
+struct vm_operations_struct generic_file_vm_ops = {
+};
+
+/*
+ * Change the size of @inode to @offset.
+ *
+ * Shrinking (or keeping the size) updates i_size and drops the page cache
+ * beyond @offset with truncate_inode_pages().  Growing is subject to the
+ * caller's RLIMIT_FSIZE (SIGXFSZ is sent when exceeded) and to the
+ * superblock's s_maxbytes.  In both successful cases the inode's
+ * ->truncate() operation is invoked when it exists.
+ *
+ * Returns 0 on success or -EFBIG when the new size is not allowed.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long limit;
+
+	if (inode->i_size < offset) {
+		/* expanding: check the size limits first */
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && offset > limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		if (offset > inode->i_sb->s_maxbytes)
+			return -EFBIG;
+		i_size_write(inode, offset);
+	} else {
+		/* shrinking: publish the new size, then discard the tail */
+		i_size_write(inode, offset);
+		truncate_inode_pages(mapping, offset);
+	}
+
+	if (inode->i_op && inode->i_op->truncate)
+		inode->i_op->truncate(inode);
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate);
+
+/*
+ * Return the total memory allocated for this pointer, not
+ * just what the caller asked for.
+ *
+ * Slab objects report ksize(); for any other page, page->index is treated
+ * as the allocation order and PAGE_SIZE << order is returned.  A NULL
+ * pointer, or one with no struct page, yields 0.
+ *
+ * Doesn't have to be accurate, i.e. may have races.
+ */
+unsigned int kobjsize(const void *objp)
+{
+	struct page *page;
+
+	if (!objp)
+		return 0;
+
+	page = virt_to_page(objp);
+	if (!page)
+		return 0;
+
+	if (PageSlab(page))
+		return ksize(objp);
+
+	BUG_ON(page->index < 0);
+	BUG_ON(page->index >= MAX_ORDER);
+
+	return (PAGE_SIZE << page->index);
+}
+
+/*
+ * The nommu dodgy version :-)
+ *
+ * For each of the @len pages starting at @start, store virt_to_page() of
+ * the address in @pages (taking a page cache reference when the result is
+ * non-NULL) and point the matching @vmas slot at a shared static dummy
+ * vma.  Either array may be NULL.  @tsk, @mm, @write and @force are not
+ * used.
+ *
+ * Returns the number of pages processed.
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+	unsigned long start, int len, int write, int force,
+	struct page **pages, struct vm_area_struct **vmas)
+{
+	static struct vm_area_struct dummy_vma;
+	int i = 0;
+
+	while (i < len) {
+		if (pages) {
+			struct page *page = virt_to_page(start);
+
+			pages[i] = page;
+			if (page)
+				page_cache_get(page);
+		}
+		if (vmas)
+			vmas[i] = &dummy_vma;
+
+		start += PAGE_SIZE;
+		i++;
+	}
+
+	return i;
+}
+
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+/*
+ * Release memory obtained from __vmalloc(); here that memory comes from
+ * kmalloc(), so it is simply handed to kfree().
+ */
+void vfree(void *addr)
+{
+	void *mem = addr;
+
+	kfree(mem);
+}
+
+/*
+ * Allocate @size bytes with kmalloc(), using @gfp_mask with __GFP_HIGHMEM
+ * stripped.  @prot is not used.  Returns whatever kmalloc() returns.
+ */
+void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
+{
+	/* kmalloc doesn't like __GFP_HIGHMEM for some reason */
+	int kmalloc_mask = gfp_mask & ~__GFP_HIGHMEM;
+
+	return kmalloc(size, kmalloc_mask);
+}
+
+/* Return the struct page for @addr, which is just virt_to_page() here. */
+struct page * vmalloc_to_page(void *addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	return page;
+}
+
+/* Return the page frame number of the page that virt_to_page() gives for @addr. */
+unsigned long vmalloc_to_pfn(void *addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	return page_to_pfn(page);
+}
+
+
+/*
+ * Copy @count bytes from @addr into @buf.
+ *
+ * As in vwrite(), @count is clamped so that @addr + @count cannot wrap
+ * around the top of the address space.
+ *
+ * Returns the number of bytes actually copied.
+ */
+long vread(char *buf, char *addr, unsigned long count)
+{
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+
+	memcpy(buf, addr, count);
+	return count;
+}
+
+/*
+ * Copy @count bytes from @buf to @addr.
+ *
+ * @count is clamped so that @addr + @count cannot wrap around the top of
+ * the address space.  Returns the number of bytes actually copied.
+ */
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+	unsigned long dest = (unsigned long) addr;
+
+	/* Don't allow overflow */
+	if (dest + count < count)
+		count = -dest;
+
+	memcpy(addr, buf, count);
+	return count;
+}
+
+/*
+ *	vmalloc  -  allocate virtually contiguous memory
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator and map them into contiguous kernel virtual space.
+ *	In this file the request is forwarded to __vmalloc() with
+ *	GFP_KERNEL | __GFP_HIGHMEM and PAGE_KERNEL.
+ *
+ *	For tight control over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+	void *mem = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+
+	return mem;
+}
+
+/*
+ *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough 32bit PA addressable pages to cover @size from the
+ *	page level allocator and map them into contiguous kernel virtual space.
+ *	In this file the request is forwarded to __vmalloc() with
+ *	GFP_KERNEL and PAGE_KERNEL.
+ */
+void *vmalloc_32(unsigned long size)
+{
+	void *mem = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+
+	return mem;
+}
+
+/*
+ * Not implemented in this file: any call hits BUG().  The NULL return only
+ * follows the BUG() call.
+ */
+void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
+{
+	BUG();
+	return NULL;
+}
+
+/* Counterpart of vmap(); likewise not implemented here, any call hits BUG(). */
+void vunmap(void *addr)
+{
+	BUG();
+}
+
+/*
+ *  brk() system call.
+ *
+ *  A request outside [mm->start_brk, mm->context.end_brk], or one equal to
+ *  the current break, changes nothing and returns the current mm->brk.
+ *  Any other value, whether it shrinks or grows the break, is stored in
+ *  mm->brk and returned.  No lock is taken here.
+ */
+asmlinkage unsigned long sys_brk(unsigned long brk)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (brk < mm->start_brk || brk > mm->context.end_brk ||
+	    mm->brk == brk)
+		return mm->brk;
+
+	/*
+	 * Shrinking is always allowed, and growing within the window
+	 * checked above is too - let it rip.
+	 */
+	mm->brk = brk;
+	return brk;
+}
+
+#ifdef DEBUG
+/*
+ * Debugging aid: print the current process's list of vm_list_struct
+ * entries, one " entry: vma" pair per element, with the allocated size,
+ * start address and usage count of each vma that is present.
+ */
+static void show_process_blocks(void)
+{
+	struct vm_list_struct *vml;
+
+	printk("Process blocks %d:", current->pid);
+
+	/*
+	 * NOTE(review): the walk starts from the address of context.vmlist;
+	 * if that member is itself a pointer to the first entry the '&' is
+	 * wrong - confirm against the mm_context_t definition.
+	 */
+	for (vml = &current->mm->context.vmlist; vml; vml = vml->next) {
+		printk(" %p: %p", vml, vml->vma);
+		if (vml->vma)
+			/* kobjsize() returns unsigned int, hence %u */
+			printk(" (%u @%lx #%d)",
+			       kobjsize((void *) vml->vma->vm_start),
+			       vml->vma->vm_start,
+			       atomic_read(&vml->vma->vm_usage));
+		printk(vml->next ? " ->" : ".\n");
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * Search nommu_vma_tree for a VMA whose vm_start equals @start exactly.
+ * Returns the VMA, or NULL when no node has that start address.
+ */
+static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
+{
+	struct rb_node *n;
+
+	for (n = nommu_vma_tree.rb_node; n; ) {
+		struct vm_area_struct *vma;
+
+		vma = rb_entry(n, struct vm_area_struct, vm_rb);
+		if (start == vma->vm_start)
+			return vma;
+
+		n = (start < vma->vm_start) ? n->rb_left : n->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register @vma: file-backed VMAs are inserted into their mapping's
+ * i_mmap prio tree, and every VMA is inserted into nommu_vma_tree, ordered
+ * by vm_start and, for equal start addresses, by the address of the
+ * vm_area_struct itself.
+ */
+static void add_nommu_vma(struct vm_area_struct *vma)
+{
+	struct rb_node **link = &nommu_vma_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	/* add the VMA to the mapping */
+	if (vma->vm_file) {
+		struct address_space *mapping = vma->vm_file->f_mapping;
+
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_insert(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	/* add the VMA to the master list */
+	while (*link) {
+		struct vm_area_struct *pvma;
+		int go_left;
+
+		parent = *link;
+		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
+
+		if (vma->vm_start != pvma->vm_start) {
+			go_left = vma->vm_start < pvma->vm_start;
+		} else {
+			/* mappings are at the same address - this can only
+			 * happen for shared-mem chardevs and shared file
+			 * mappings backed by ramfs/tmpfs */
+			BUG_ON(!(pvma->vm_flags & VM_SHARED));
+
+			/* the very same VMA must not be inserted twice */
+			if (vma == pvma)
+				BUG();
+			go_left = vma < pvma;
+		}
+
+		link = go_left ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&vma->vm_rb, parent, link);
+	rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
+}
+
+/*
+ * Undo add_nommu_vma(): remove @vma from its file mapping's i_mmap prio
+ * tree (file-backed VMAs only) and from nommu_vma_tree.
+ */
+static void delete_nommu_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+
+	/* remove the VMA from the mapping */
+	if (file) {
+		struct address_space *mapping = file->f_mapping;
+
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	/* remove from the master list */
+	rb_erase(&vma->vm_rb, &nommu_vma_tree);
+}
+
+/*
+ * determine whether a mapping should be permitted and, if so, what sort of
+ * mapping we're capable of supporting
+ *
+ * @file:          file to map, or NULL for an anonymous mapping
+ * @addr:          requested address; must be 0 (and MAP_FIXED clear)
+ * @len:           length of the mapping in bytes
+ * @prot:          PROT_* bits; adjusted locally for READ_IMPLIES_EXEC before
+ *                 being shown to the security hook
+ * @flags:         MAP_* bits; the type must be MAP_PRIVATE or MAP_SHARED
+ * @pgoff:         file offset in pages, only checked for overflow here
+ * @_capabilities: on success receives the BDI_CAP_* bits still usable
+ *
+ * Returns 0 when the mapping is allowed and a negative errno otherwise.
+ *
+ * NOTE(review): when PAGE_ALIGN(len) is 0 the function returns @addr, which
+ * is necessarily 0 at that point, without writing *_capabilities - confirm
+ * that the caller copes with that.
+ */
+static int validate_mmap_request(struct file *file,
+				 unsigned long addr,
+				 unsigned long len,
+				 unsigned long prot,
+				 unsigned long flags,
+				 unsigned long pgoff,
+				 unsigned long *_capabilities)
+{
+	unsigned long capabilities;
+	unsigned long reqprot = prot;	/* prot as originally requested */
+	int ret;
+
+	/* do the simple checks first */
+	if (flags & MAP_FIXED || addr) {
+		printk(KERN_DEBUG
+		       "%d: Can't do fixed-address/overlay mmap of RAM\n",
+		       current->pid);
+		return -EINVAL;
+	}
+
+	if ((flags & MAP_TYPE) != MAP_PRIVATE &&
+	    (flags & MAP_TYPE) != MAP_SHARED)
+		return -EINVAL;
+
+	if (PAGE_ALIGN(len) == 0)
+		return addr;
+
+	if (len > TASK_SIZE)
+		return -EINVAL;
+
+	/* offset overflow? */
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+		return -EINVAL;
+
+	if (file) {
+		/* validate file mapping requests */
+		struct address_space *mapping;
+
+		/* files must support mmap */
+		if (!file->f_op || !file->f_op->mmap)
+			return -ENODEV;
+
+		/* work out if what we've got could possibly be shared
+		 * - we support chardevs that provide their own "memory"
+		 * - we support files/blockdevs that are memory backed
+		 */
+		mapping = file->f_mapping;
+		if (!mapping)
+			mapping = file->f_dentry->d_inode->i_mapping;
+
+		capabilities = 0;
+		if (mapping && mapping->backing_dev_info)
+			capabilities = mapping->backing_dev_info->capabilities;
+
+		if (!capabilities) {
+			/* no explicit capabilities set, so assume some
+			 * defaults */
+			switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+			case S_IFREG:
+			case S_IFBLK:
+				capabilities = BDI_CAP_MAP_COPY;
+				break;
+
+			case S_IFCHR:
+				capabilities =
+					BDI_CAP_MAP_DIRECT |
+					BDI_CAP_READ_MAP |
+					BDI_CAP_WRITE_MAP;
+				break;
+
+			default:
+				return -EINVAL;
+			}
+		}
+
+		/* eliminate any capabilities that we can't support on this
+		 * device */
+		if (!file->f_op->get_unmapped_area)
+			capabilities &= ~BDI_CAP_MAP_DIRECT;
+		if (!file->f_op->read)
+			capabilities &= ~BDI_CAP_MAP_COPY;
+
+		if (flags & MAP_SHARED) {
+			/* do checks for writing, appending and locking */
+			if ((prot & PROT_WRITE) &&
+			    !(file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			if (IS_APPEND(file->f_dentry->d_inode) &&
+			    (file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			if (locks_verify_locked(file->f_dentry->d_inode))
+				return -EAGAIN;
+
+			/* a shared mapping has to be mapped directly */
+			if (!(capabilities & BDI_CAP_MAP_DIRECT))
+				return -ENODEV;
+
+			/* each requested protection needs the matching capability */
+			if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
+			    ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+			    ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
+			    ) {
+				printk("MAP_SHARED not completely supported on !MMU\n");
+				return -EINVAL;
+			}
+
+			/* we mustn't privatise shared mappings */
+			capabilities &= ~BDI_CAP_MAP_COPY;
+		}
+		else {
+			/* we're going to read the file into private memory we
+			 * allocate */
+			if (!(capabilities & BDI_CAP_MAP_COPY))
+				return -ENODEV;
+
+			/* we don't permit a private writable mapping to be
+			 * shared with the backing device */
+			if (prot & PROT_WRITE)
+				capabilities &= ~BDI_CAP_MAP_DIRECT;
+		}
+
+		/* handle executable mappings and implied executable
+		 * mappings */
+		if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (prot & PROT_EXEC)
+				return -EPERM;
+		}
+		else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+			/* handle implication of PROT_EXEC by PROT_READ */
+			if (current->personality & READ_IMPLIES_EXEC) {
+				if (capabilities & BDI_CAP_EXEC_MAP)
+					prot |= PROT_EXEC;
+			}
+		}
+		else if ((prot & PROT_READ) &&
+			 (prot & PROT_EXEC) &&
+			 !(capabilities & BDI_CAP_EXEC_MAP)
+			 ) {
+			/* backing file is not executable, try to copy */
+			capabilities &= ~BDI_CAP_MAP_DIRECT;
+		}
+	}
+	else {
+		/* anonymous mappings are always memory backed and can be
+		 * privately mapped
+		 */
+		capabilities = BDI_CAP_MAP_COPY;
+
+		/* handle PROT_EXEC implication by PROT_READ */
+		if ((prot & PROT_READ) &&
+		    (current->personality & READ_IMPLIES_EXEC))
+			prot |= PROT_EXEC;
+	}
+
+	/* allow the security API to have its say */
+	ret = security_file_mmap(file, reqprot, prot, flags);
+	if (ret < 0)
+		return ret;
+
+	/* looks okay */
+	*_capabilities = capabilities;
+	return 0;
+}
+
+/*
+ * we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags
+ *
+ * Returns the VM_* flags built from @prot and @flags, with VM_MAYSHARE
+ * (and VM_SHARED for MAP_SHARED) added according to @capabilities.
+ */
+static unsigned long determine_vm_flags(struct file *file,
+					unsigned long prot,
+					unsigned long flags,
+					unsigned long capabilities)
+{
+	unsigned long vm_flags;
+
+	/* mm->def_flags is deliberately not folded in here */
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+		   VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (capabilities & BDI_CAP_MAP_DIRECT) {
+		/* overlay a shareable mapping on the backing device or inode
+		 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
+		 * romfs/cramfs */
+		if (flags & MAP_SHARED)
+			vm_flags |= VM_MAYSHARE | VM_SHARED;
+		else if (!(((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS))
+			vm_flags |= VM_MAYSHARE;
+	} else if (file && !(prot & PROT_WRITE)) {
+		/* attempt to share read-only copies of mapped file chunks */
+		vm_flags |= VM_MAYSHARE;
+	}
+
+	/* refuse to let anyone share private mappings with this process if
+	 * it's being traced - otherwise breakpoints set in it may interfere
+	 * with another untraced process
+	 */
+	if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+		vm_flags &= ~VM_MAYSHARE;
+
+	return vm_flags;
+}
+
+/*
+ * set up a shared mapping on a file
+ *
+ * Calls the file's ->mmap() operation and returns its result, except that
+ * -ENOSYS is reported to the caller as -ENODEV.  @len is not used.
+ */
+static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+{
+	struct file *file = vma->vm_file;
+	int ret = file->f_op->mmap(file, vma);
+
+	/* getting an ENOSYS error indicates that direct mmap isn't
+	 * possible (as opposed to tried but failed); for a shared
+	 * mapping that is reported as -ENODEV */
+	return (ret == -ENOSYS) ? -ENODEV : ret;
+}
+
+/*
+ * set up a private mapping or an anonymous shared mapping
+ */
+static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+{
+	void *base;
+	int ret;
+
+	/* invoke the file's mapping function so that it can keep track of
+	 * shared mappings on devices or memory
+	 * - VM_MAYSHARE will be set if it may attempt to share
+	 */
+	if (vma->vm_file) {
+		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+		if (ret != -ENOSYS) {
+			/* shouldn't return success if we're not sharing */
+			BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
+			return ret; /* success or a real error */
+		}
+
+		/* getting an ENOSYS error indicates that direct mmap isn't
+		 * possible (as opposed to tried but failed) so we'll try to
+		 * make a private copy of the data and map that instead */
+	}
+
+	/* allocate some memory to hold the mapping
+	 * - note that this may not return a page-aligned address if the object
+	 *   we're allocating is smaller than a page
+	 */
+	base = kmalloc(len, GFP_KERNEL);
+	if (!base)
+		goto enomem;
+
+	vma->vm_start = (unsigned long) base;
+	vma->vm_end = vma->vm_start + len;
+	vma->vm_flags |= VM_MAPPED_COPY;
+
+#ifdef WARN_ON_SLACK
+	if (len + WARN_ON_SLACK <= kobjsize(base))
+		printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
+		       len, current->pid, kobjsize(base) - len);
+#endif
+
+	if (vma->vm_file) {
+		/* read the contents of a file into the copy */
+		mm_segment_t old_fs;
+		loff_t fpos;
+
+		fpos = vma->vm_pgoff;
+		fpos <<= PAGE_SHIFT;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+		set_fs(old_fs);
+
+		if (ret < 0)
+			goto error_free;
+
+		/* a short read leaves a tail of the copy unfilled; zero it */
+		if (ret < len)
+			memset(base + ret, 0, len - ret);
+
+	} else {
+		/* if it's an anonymous mapping, then just clear it */
+		memset(base, 0, len);
+	}
+
+	return 0;
+
+error_free:
+	kfree(base);
+	vma->vm_start = 0;
+	return ret;
+
+enomem:
+	printk("Allocation of length %lu from process %d failed\n",
+	       len, current->pid);
+	show_free_areas();
+	return -ENOMEM;
+}
+
+/*
+ * handle mapping creation for uClinux
+ */
+unsigned long do_mmap_pgoff(struct file *file,
+			    unsigned long addr,
+			    unsigned long len,
+			    unsigned long prot,
+			    unsigned long flags,
+			    unsigned long pgoff)
+{
+	struct vm_list_struct *vml = NULL;
+	struct vm_area_struct *vma = NULL;
+	struct rb_node *rb;
+	unsigned long capabilities, vm_flags;
+	void *result;
+	int ret;
+
+	/* decide whether we should attempt the mapping, and if so what sort of
+	 * mapping */
+	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
+				    &capabilities);
+	if (ret < 0)
+		return ret;
+
+	/* we've determined that we can make the mapping, now translate what we
+	 * now know into VMA flags */
+	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
+	/* we're going to need to record the mapping if it works */
+	vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+	if (!vml)
+		goto error_getting_vml;
+	memset(vml, 0, sizeof(*vml));
+
+	down_write(&nommu_vma_sem);
+
+	/* if we want to share, we need to check for VMAs created by other
+	 * mmap() calls that overlap with our proposed mapping
+	 * - we can only share with an exact match on most regular files
+	 * - shared mappings on character devices and memory backed files are
+	 *   permitted to overlap inexactly as far as we are concerned for in
+	 *   these cases, sharing is handled in the driver or filesystem rather
+	 *   than here
+	 */
+	if (vm_flags & VM_MAYSHARE) {
+		unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long vmpglen;
+
+		for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
+			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
+
+			if (!(vma->vm_flags & VM_MAYSHARE))
+				continue;
+
+			/* search for overlapping mappings on the same file */
+			if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+				continue;
+
+			if (vma->vm_pgoff >= pgoff + pglen)
+				continue;
+
+			vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
+			vmpglen >>= PAGE_SHIFT;
+			if (pgoff >= vma->vm_pgoff + vmpglen)
+				continue;
+
+			/* handle inexactly overlapping matches between mappings */
+			if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+				if (!(capabilities & BDI_CAP_MAP_DIRECT))
+					goto sharing_violation;
+				continue;
+			}
+
+			/* we've found a VMA we can share */
+			atomic_inc(&vma->vm_usage);
+
+			vml->vma = vma;
+			result = (void *) vma->vm_start;
+			goto shared;
+		}
+
+		vma = NULL;
+
+		/* obtain the address at which to make a shared mapping
+		 * - this is the hook for quasi-memory character devices to
+		 *   tell us the location of a shared mapping
+		 */
+		if (file && file->f_op->get_unmapped_area) {
+			addr = file->f_op->get_unmapped_area(file, addr, len,
+							     pgoff, flags);
+			if (IS_ERR((void *) addr)) {
+				ret = addr;
+				if (ret != (unsigned long) -ENOSYS)
+					goto error;
+
+				/* the driver refused to tell us where to site
+				 * the mapping so we'll have to attempt to copy
+				 * it */
+				ret = (unsigned long) -ENODEV;
+				if (!(capabilities & BDI_CAP_MAP_COPY))
+					goto error;
+
+				capabilities &= ~BDI_CAP_MAP_DIRECT;
+			}
+		}
+	}
+
+	/* we're going to need a VMA struct as well */
+	vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+	if (!vma)
+		goto error_getting_vma;
+
+	memset(vma, 0, sizeof(*vma));
+	INIT_LIST_HEAD(&vma->anon_vma_node);
+	atomic_set(&vma->vm_usage, 1);
+	if (file)
+		get_file(file);
+	vma->vm_file	= file;
+	vma->vm_flags	= vm_flags;
+	vma->vm_start	= addr;
+	vma->vm_end	= addr + len;
+	vma->vm_pgoff	= pgoff;
+
+	vml->vma = vma;
+
+	/* set up the mapping */
+	if (file && vma->vm_flags & VM_SHARED)
+		ret = do_mmap_shared_file(vma, len);
+	else
+		ret = do_mmap_private(vma, len);
+	if (ret < 0)
+		goto error;
+
+	/* okay... we have a mapping; now we have to register it */
+	result = (void *) vma->vm_start;
+
+	if (vma->vm_flags & VM_MAPPED_COPY) {
+		realalloc += kobjsize(result);
+		askedalloc += len;
+	}
+
+	realalloc += kobjsize(vma);
+	askedalloc += sizeof(*vma);
+
+	current->mm->total_vm += len >> PAGE_SHIFT;
+
+	add_nommu_vma(vma);
+
+ shared:
+	realalloc += kobjsize(vml);
+	askedalloc += sizeof(*vml);
+
+	vml->next = current->mm->context.vmlist;
+	current->mm->context.vmlist = vml;
+
+	up_write(&nommu_vma_sem);
+
+	if (prot & PROT_EXEC)
+		flush_icache_range((unsigned long) result,
+				   (unsigned long) result + len);
+
+#ifdef DEBUG
+	printk("do_mmap:\n");
+	show_process_blocks();
+#endif
+
+	return (unsigned long) result;
+
+ error:
+	up_write(&nommu_vma_sem);
+	kfree(vml);
+	/* anonymous mappings have no file to drop; kfree(NULL) is a no-op */
+	if (vma && vma->vm_file)
+		fput(vma->vm_file);
+	kfree(vma);
+	return ret;
+
+ sharing_violation:
+	up_write(&nommu_vma_sem);
+	printk("Attempt to share mismatched mappings\n");
+	kfree(vml);
+	return -EINVAL;
+
+ error_getting_vma:
+	up_write(&nommu_vma_sem);
+	kfree(vml);
+	printk("Allocation of vma for %lu byte allocation from process %d failed\n",
+	       len, current->pid);
+	show_free_areas();
+	return -ENOMEM;
+
+ error_getting_vml:
+	printk("Allocation of vml for %lu byte allocation from process %d failed\n",
+	       len, current->pid);
+	show_free_areas();
+	return -ENOMEM;
+}
+
+/*
+ * handle mapping disposal for uClinux
+ */
+static void put_vma(struct vm_area_struct *vma)
+{
+	if (vma) {
+		down_write(&nommu_vma_sem);
+
+		if (atomic_dec_and_test(&vma->vm_usage)) {
+			delete_nommu_vma(vma);
+
+			if (vma->vm_ops && vma->vm_ops->close)
+				vma->vm_ops->close(vma);
+
+			/* IO memory and memory shared directly out of the pagecache from
+			 * ramfs/tmpfs mustn't be released here */
+			if (vma->vm_flags & VM_MAPPED_COPY) {
+				realalloc -= kobjsize((void *) vma->vm_start);
+				askedalloc -= vma->vm_end - vma->vm_start;
+				kfree((void *) vma->vm_start);
+			}
+
+			realalloc -= kobjsize(vma);
+			askedalloc -= sizeof(*vma);
+
+			if (vma->vm_file)
+				fput(vma->vm_file);
+			kfree(vma);
+		}
+
+		up_write(&nommu_vma_sem);
+	}
+}
+
+int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
+{
+	struct vm_list_struct *vml, **parent;
+	unsigned long end = addr + len;
+
+#ifdef DEBUG
+	printk("do_munmap:\n");
+#endif
+
+	for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
+		if ((*parent)->vma->vm_start == addr &&
+		    (*parent)->vma->vm_end == end)
+			goto found;
+
+	printk("munmap of non-mmaped memory by process %d (%s): %p\n",
+	       current->pid, current->comm, (void *) addr);
+	return -EINVAL;
+
+ found:
+	vml = *parent;
+
+	put_vma(vml->vma);
+
+	*parent = vml->next;
+	realalloc -= kobjsize(vml);
+	askedalloc -= sizeof(*vml);
+	kfree(vml);
+	mm->total_vm -= len >> PAGE_SHIFT;
+
+#ifdef DEBUG
+	show_process_blocks();
+#endif
+
+	return 0;
+}
+
+/* Release all mmaps. */
+void exit_mmap(struct mm_struct * mm)
+{
+	struct vm_list_struct *tmp;
+
+	if (mm) {
+#ifdef DEBUG
+		printk("Exit_mmap:\n");
+#endif
+
+		mm->total_vm = 0;
+
+		while ((tmp = mm->context.vmlist)) {
+			mm->context.vmlist = tmp->next;
+			put_vma(tmp->vma);
+
+			realalloc -= kobjsize(tmp);
+			askedalloc -= sizeof(*tmp);
+			kfree(tmp);
+		}
+
+#ifdef DEBUG
+		show_process_blocks();
+#endif
+	}
+}
+
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
+{
+	int ret;
+	struct mm_struct *mm = current->mm;
+
+	down_write(&mm->mmap_sem);
+	ret = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+	return -ENOMEM;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ *
+ * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the
+ * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
+ */
+unsigned long do_mremap(unsigned long addr,
+			unsigned long old_len, unsigned long new_len,
+			unsigned long flags, unsigned long new_addr)
+{
+	struct vm_list_struct *vml = NULL;
+
+	/* insanity checks first */
+	if (new_len == 0)
+		return (unsigned long) -EINVAL;
+
+	if (flags & MREMAP_FIXED && new_addr != addr)
+		return (unsigned long) -EINVAL;
+
+	for (vml = current->mm->context.vmlist; vml; vml = vml->next)
+		if (vml->vma->vm_start == addr)
+			goto found;
+
+	return (unsigned long) -EINVAL;
+
+ found:
+	if (vml->vma->vm_end != vml->vma->vm_start + old_len)
+		return (unsigned long) -EFAULT;
+
+	if (vml->vma->vm_flags & VM_MAYSHARE)
+		return (unsigned long) -EPERM;
+
+	if (new_len > kobjsize((void *) addr))
+		return (unsigned long) -ENOMEM;
+
+	/* all checks complete - do it */
+	vml->vma->vm_end = vml->vma->vm_start + new_len;
+
+	askedalloc -= old_len;
+	askedalloc += new_len;
+
+	return vml->vma->vm_start;
+}
+
+/*
+ * Look up the VMA which contains addr (vm_start <= addr < vm_end), NULL if none
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_list_struct *vml;
+
+	for (vml = mm->context.vmlist; vml; vml = vml->next)
+		if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
+			return vml->vma;
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(find_vma);
+
+struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+{
+	return NULL;
+}
+
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	return NULL;
+}
+
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
+		unsigned long to, unsigned long size, pgprot_t prot)
+{
+	return -EPERM;
+}
+
+void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+}
+
+unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	return -ENOMEM;
+}
+
+void arch_unmap_area(struct vm_area_struct *area)
+{
+}
+
+void update_mem_hiwater(struct task_struct *tsk)
+{
+	if (likely(tsk->mm)) {
+		/* only read the rss counter once tsk->mm is known non-NULL */
+		unsigned long rss = get_mm_counter(tsk->mm, rss);
+		if (tsk->mm->hiwater_rss < rss)
+			tsk->mm->hiwater_rss = rss;
+		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
+			tsk->mm->hiwater_vm = tsk->mm->total_vm;
+	}
+}
+
+void unmap_mapping_range(struct address_space *mapping,
+			 loff_t const holebegin, loff_t const holelen,
+			 int even_cows)
+{
+}
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(long pages, int cap_sys_admin)
+{
+	unsigned long free, allowed;
+
+	vm_acct_memory(pages);
+
+	/*
+	 * Sometimes we want to use more memory than we have
+	 */
+	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+		return 0;
+
+	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+		unsigned long n;
+
+		free = get_page_cache_size();
+		free += nr_swap_pages;
+
+		/*
+		 * Any slabs which are created with the
+		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+		 * which are reclaimable, under pressure.  The dentry
+		 * cache and most inode caches should fall into this
+		 */
+		free += atomic_read(&slab_reclaim_pages);
+
+		/*
+		 * Leave the last 3% for root
+		 */
+		if (!cap_sys_admin)
+			free -= free / 32;
+
+		if (free > pages)
+			return 0;
+
+		/*
+		 * nr_free_pages() is very expensive on large systems,
+		 * only call if we're about to fail.
+		 */
+		n = nr_free_pages();
+		if (!cap_sys_admin)
+			n -= n / 32;
+		free += n;
+
+		if (free > pages)
+			return 0;
+		vm_unacct_memory(pages);
+		return -ENOMEM;
+	}
+
+	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	/*
+	 * Leave the last 3% for root
+	 */
+	if (!cap_sys_admin)
+		allowed -= allowed / 32;
+	allowed += total_swap_pages;
+
+	/* Don't let a single process grow too big:
+	   leave 3% of the size of this process for other processes */
+	allowed -= current->mm->total_vm / 32;
+
+	if (atomic_read(&vm_committed_space) < allowed)
+		return 0;
+
+	vm_unacct_memory(pages);
+
+	return -ENOMEM;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+	return 0;
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000000..9595a0f6c4b8
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,292 @@
+/*
+ *  linux/mm/oom_kill.c
+ * 
+ *  Copyright (C)  1998,2000  Rik van Riel
+ *	Thanks go out to Claus Fischer for some serious inspiration and
+ *	for goading me into coding this file...
+ *
+ *  The routines in this file are used to kill a process when
+ *  we're seriously out of memory. This gets called from kswapd()
+ *  in linux/mm/vmscan.c when we really run out of memory.
+ *
+ *  Since we won't call these routines often (on a well-configured
+ *  machine) this file will double as a 'coding guide' and a signpost
+ *  for newbie kernel hackers. It features several pointers to major
+ *  kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+
+/* #define DEBUG */
+
+/**
+ * badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of which task we should calculate
+ * @uptime: current uptime in seconds
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum amount of processes (one)
+ * 5) we try to kill the process the user expects us to kill, this
+ *    algorithm has been meticulously tuned to meet the principle
+ *    of least surprise ... (be careful when you change it)
+ */
+
+unsigned long badness(struct task_struct *p, unsigned long uptime)
+{
+	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
+
+	if (!p->mm)
+		return 0;
+
+	/*
+	 * The memory size of the process is the basis for the badness.
+	 */
+	points = p->mm->total_vm;
+
+	/*
+	 * Processes which fork a lot of child processes are likely
+	 * a good choice. We add the vmsize of the children if they
+	 * have their own mm. This prevents forking servers from flooding
+	 * the machine with an endless number of children.
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
+	 * CPU time is in tens of seconds and run time is in thousands
+	 * of seconds. There is no particular reason for this other than
+	 * that it turned out to work very well in practice.
+	 */
+	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
+		>> (SHIFT_HZ + 3);
+
+	if (uptime >= p->start_time.tv_sec)
+		run_time = (uptime - p->start_time.tv_sec) >> 10;
+	else
+		run_time = 0;
+
+	s = int_sqrt(cpu_time);
+	if (s)
+		points /= s;
+	s = int_sqrt(int_sqrt(run_time));
+	if (s)
+		points /= s;
+
+	/*
+	 * Niced processes are most likely less important, so double
+	 * their badness points.
+	 */
+	if (task_nice(p) > 0)
+		points *= 2;
+
+	/*
+	 * Superuser processes are usually more important, so we make it
+	 * less likely that we kill those.
+	 */
+	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
+				p->uid == 0 || p->euid == 0)
+		points /= 4;
+
+	/*
+	 * We don't want to kill a process with direct hardware access.
+	 * Not only could that mess up the hardware, but usually users
+	 * tend to only have this flag set on applications they think
+	 * of as important.
+	 */
+	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+		points /= 4;
+
+	/*
+	 * Adjust the score by oomkilladj.
+	 */
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0)
+			points <<= p->oomkilladj;
+		else
+			points >>= -(p->oomkilladj);
+	}
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
+	p->pid, p->comm, points);
+#endif
+	return points;
+}
+
+/*
+ * Simple selection loop. We choose the process with the highest
+ * number of 'points'. We expect the caller will lock the tasklist.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+static struct task_struct * select_bad_process(void)
+{
+	unsigned long maxpoints = 0;
+	struct task_struct *g, *p;
+	struct task_struct *chosen = NULL;
+	struct timespec uptime;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	do_each_thread(g, p)
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This task is already releasing memory, so wait for it to
+			 * finish before killing some other task by mistake.
+			 */
+			if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
+			    !(p->flags & PF_DEAD))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
+			if (points > maxpoints || !chosen) {
+				chosen = p;
+				maxpoints = points;
+			}
+		}
+	while_each_thread(g, p);
+	return chosen;
+}
+
+/*
+ * Ideally we would never send SIGKILL to a process with CAP_SYS_RAWIO
+ * set and would send SIGTERM instead (it's unlikely that we select such
+ * a process anyway); note the code below currently always sends SIGKILL.
+ */
+static void __oom_kill_task(task_t *p)
+{
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
+	task_lock(p);
+	if (!p->mm || p->mm == &init_mm) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill an mm-less task!\n");
+		task_unlock(p);
+		return;
+	}
+	task_unlock(p);
+	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
+
+	/*
+	 * We give our sacrificial lamb high priority and access to
+	 * all the memory it needs. That way it should be able to
+	 * exit() and clear out its resources quickly...
+	 */
+	p->time_slice = HZ;
+	set_tsk_thread_flag(p, TIF_MEMDIE);
+
+	force_sig(SIGKILL, p);
+}
+
+static struct mm_struct *oom_kill_task(task_t *p)
+{
+	struct mm_struct *mm = get_task_mm(p);
+	task_t * g, * q;
+
+	if (!mm)
+		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
+	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
+	return mm;
+}
+
+static struct mm_struct *oom_kill_process(struct task_struct *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
+
+/**
+ * out_of_memory - kill the "best" process when we run out of memory
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ */
+void out_of_memory(unsigned int __nocast gfp_mask)
+{
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
+	read_lock(&tasklist_lock);
+retry:
+	p = select_bad_process();
+
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		show_free_areas();
+		panic("Out of memory and no killable processes...\n");
+	}
+
+	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+	show_free_areas();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
+
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
+
+	/*
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
+	 */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
new file mode 100644
index 000000000000..6ddd6a29c73b
--- /dev/null
+++ b/mm/page-writeback.c
@@ -0,0 +1,819 @@
+/*
+ * mm/page-writeback.c.
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002	akpm@zip.com.au
+ *		Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/mpage.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+/*
+ * The maximum number of pages to writeout in a single bdflush/kupdate
+ * operation.  We do this so we don't hold I_LOCK against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES	1024
+
+/*
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling.
+ */
+static long ratelimit_pages = 32;
+
+static long total_pages;	/* The total number of pages in the machine. */
+static int dirty_exceeded;	/* Dirty mem may be over limit */
+
+/*
+ * When balance_dirty_pages decides that the caller needs to perform some
+ * non-background writeback, this is how many pages it will attempt to write.
+ * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * large amounts of I/O are submitted.
+ */
+static inline long sync_writeback_pages(void)
+{
+	return ratelimit_pages + ratelimit_pages / 2;
+}
+
+/* The following parameters are exported via /proc/sys/vm */
+
+/*
+ * Start background writeback (via pdflush) at this percentage
+ */
+int dirty_background_ratio = 10;
+
+/*
+ * The generator of dirty data starts writeback at this percentage
+ */
+int vm_dirty_ratio = 40;
+
+/*
+ * The interval between `kupdate'-style writebacks, in centiseconds
+ * (hundredths of a second)
+ */
+int dirty_writeback_centisecs = 5 * 100;
+
+/*
+ * The longest number of centiseconds for which data is allowed to remain dirty
+ */
+int dirty_expire_centisecs = 30 * 100;
+
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
+EXPORT_SYMBOL(laptop_mode);
+
+/* End of sysctl-exported parameters */
+
+
+static void background_writeout(unsigned long _min_pages);
+
+struct writeback_state
+{
+	unsigned long nr_dirty;
+	unsigned long nr_unstable;
+	unsigned long nr_mapped;
+	unsigned long nr_writeback;
+};
+
+static void get_writeback_state(struct writeback_state *wbs)
+{
+	wbs->nr_dirty = read_page_state(nr_dirty);
+	wbs->nr_unstable = read_page_state(nr_unstable);
+	wbs->nr_mapped = read_page_state(nr_mapped);
+	wbs->nr_writeback = read_page_state(nr_writeback);
+}
+
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around.  To avoid stressing page reclaim with lots of unreclaimable
+ * pages.  It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+static void
+get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
+		struct address_space *mapping)
+{
+	int background_ratio;		/* Percentages */
+	int dirty_ratio;
+	int unmapped_ratio;
+	long background;
+	long dirty;
+	unsigned long available_memory = total_pages;
+	struct task_struct *tsk;
+
+	get_writeback_state(wbs);
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If this mapping can only allocate from low memory,
+	 * we exclude high memory from our count.
+	 */
+	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+		available_memory -= totalhigh_pages;
+#endif
+
+
+	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
+
+	dirty_ratio = vm_dirty_ratio;
+	if (dirty_ratio > unmapped_ratio / 2)
+		dirty_ratio = unmapped_ratio / 2;
+
+	if (dirty_ratio < 5)
+		dirty_ratio = 5;
+
+	background_ratio = dirty_background_ratio;
+	if (background_ratio >= dirty_ratio)
+		background_ratio = dirty_ratio / 2;
+
+	background = (background_ratio * available_memory) / 100;
+	dirty = (dirty_ratio * available_memory) / 100;
+	tsk = current;
+	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+		background += background / 4;
+		dirty += dirty / 4;
+	}
+	*pbackground = background;
+	*pdirty = dirty;
+}
+
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data.  It looks at the number of dirty pages in the machine and will force
+ * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * If we're over `background_thresh' then pdflush is woken to perform some
+ * writeout.
+ */
+static void balance_dirty_pages(struct address_space *mapping)
+{
+	struct writeback_state wbs;
+	long nr_reclaimable;
+	long background_thresh;
+	long dirty_thresh;
+	unsigned long pages_written = 0;
+	unsigned long write_chunk = sync_writeback_pages();
+
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+	for (;;) {
+		struct writeback_control wbc = {
+			.bdi		= bdi,
+			.sync_mode	= WB_SYNC_NONE,
+			.older_than_this = NULL,
+			.nr_to_write	= write_chunk,
+		};
+
+		get_dirty_limits(&wbs, &background_thresh,
+					&dirty_thresh, mapping);
+		nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
+		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+			break;
+
+		dirty_exceeded = 1;
+
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 */
+		if (nr_reclaimable) {
+			writeback_inodes(&wbc);
+			get_dirty_limits(&wbs, &background_thresh,
+					&dirty_thresh, mapping);
+			nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
+			if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;		/* We've done our duty */
+		}
+		blk_congestion_wait(WRITE, HZ/10);
+	}
+
+	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+		dirty_exceeded = 0;
+
+	if (writeback_in_progress(bdi))
+		return;		/* pdflush is already working this queue */
+
+	/*
+	 * In laptop mode, we wait until hitting the higher threshold before
+	 * starting background writeout, and then write out all the way down
+	 * to the lower threshold.  So slow writers cause minimal disk activity.
+	 *
+	 * In normal mode, we start background writeout at the lower
+	 * background_thresh, to keep the amount of dirty memory low.
+	 */
+	if ((laptop_mode && pages_written) ||
+	     (!laptop_mode && (nr_reclaimable > background_thresh)))
+		pdflush_operation(background_writeout, 0);
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied.  The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * On really big machines, get_writeback_state is expensive, so try to avoid
+ * calling it too often (ratelimiting).  But once we're over the dirty memory
+ * limit we decrease the ratelimiting by a lot, to prevent individual processes
+ * from overshooting the limit by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	static DEFINE_PER_CPU(int, ratelimits) = 0;
+	long ratelimit;
+
+	ratelimit = ratelimit_pages;
+	if (dirty_exceeded)
+		ratelimit = 8;
+
+	/*
+	 * Check the rate limiting. Also, we do not want to throttle real-time
+	 * tasks in balance_dirty_pages(). Period.
+	 */
+	if (get_cpu_var(ratelimits)++ >= ratelimit) {
+		__get_cpu_var(ratelimits) = 0;
+		put_cpu_var(ratelimits);
+		balance_dirty_pages(mapping);
+		return;
+	}
+	put_cpu_var(ratelimits);
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
+/*
+ * Hold the caller until the number of unstable plus writeback pages is no
+ * larger than the dirty threshold boosted by 10%, calling
+ * blk_congestion_wait(WRITE, HZ/10) between checks.
+ */
+void throttle_vm_writeout(void)
+{
+	struct writeback_state wbs;
+	long background_thresh;
+	long dirty_thresh;
+
+	for (;;) {
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+
+		/*
+		 * Boost the allowable dirty threshold a bit for page
+		 * allocators so they don't get DoS'ed by heavy writers
+		 */
+		dirty_thresh += dirty_thresh / 10;
+
+		if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
+			return;
+
+		blk_congestion_wait(WRITE, HZ/10);
+	}
+}
+
+
+/*
+ * Write back at least _min_pages pages, then keep writing until the amount
+ * of dirty plus unstable memory is below the background threshold.
+ *
+ * The loop also ends when a pass wrote less than a full batch (or skipped
+ * pages) and no congestion was encountered.
+ */
+static void background_writeout(unsigned long _min_pages)
+{
+	long pages_left = _min_pages;
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 0,
+		.nonblocking	= 1,
+	};
+
+	while (1) {
+		struct writeback_state wbs;
+		long background_thresh;
+		long dirty_thresh;
+
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+		if (pages_left <= 0 &&
+		    wbs.nr_dirty + wbs.nr_unstable < background_thresh)
+			return;
+
+		/* Reset the per-pass fields, then write one batch. */
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes(&wbc);
+		pages_left -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/* A full batch with nothing skipped: go straight round again. */
+		if (wbc.nr_to_write <= 0 && wbc.pages_skipped <= 0)
+			continue;
+
+		/* Wrote less than expected */
+		blk_congestion_wait(WRITE, HZ/10);
+		if (!wbc.encountered_congestion)
+			return;
+	}
+}
+
+/*
+ * Ask pdflush to write back `nr_pages' pages in the background.  A zero
+ * `nr_pages' means "the whole world": the request is sized to the current
+ * number of dirty plus unstable pages.
+ *
+ * Returns the result of pdflush_operation(): 0 if a pdflush thread was
+ * dispatched, -1 if all pdflush threads were busy.
+ */
+int wakeup_bdflush(long nr_pages)
+{
+	struct writeback_state wbs;
+
+	if (!nr_pages) {
+		get_writeback_state(&wbs);
+		nr_pages = wbs.nr_dirty + wbs.nr_unstable;
+	}
+
+	return pdflush_operation(background_writeout, nr_pages);
+}
+
+/* Timer callbacks, declared early so the timer initializers can name them. */
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+/* Periodic writeback timer; its callback dispatches wb_kupdate(). */
+static struct timer_list wb_timer =
+			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+/* Laptop-mode flush timer; (re)armed by laptop_io_completion(). */
+static struct timer_list laptop_mode_wb_timer =
+			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
+
+/*
+ * Periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_centisecs.  But if a writeback event
+ * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ *
+ * `arg' is not used by this function.
+ */
+static void wb_kupdate(unsigned long arg)
+{
+	unsigned long oldest_jif;
+	unsigned long start_jif;
+	unsigned long next_jif;
+	long nr_to_write;
+	struct writeback_state wbs;
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = &oldest_jif,
+		.nr_to_write	= 0,
+		.nonblocking	= 1,
+		.for_kupdate	= 1,
+	};
+
+	sync_supers();
+
+	get_writeback_state(&wbs);
+	/* Age cutoff, handed to the writeback code via wbc.older_than_this. */
+	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+	start_jif = jiffies;
+	/* Nominal time of the next run, one interval after this one started. */
+	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+	/* Budget for this run: dirty + unstable pages plus in-use inodes. */
+	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	while (nr_to_write > 0) {
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		writeback_inodes(&wbc);
+		if (wbc.nr_to_write > 0) {
+			/* Short batch: wait if congested, otherwise we are done. */
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ/10);
+			else
+				break;	/* All the old data is written */
+		}
+		/* Charge what this batch actually wrote against the budget. */
+		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+	}
+	/* Never schedule the next run less than one second from now. */
+	if (time_before(next_jif, jiffies + HZ))
+		next_jif = jiffies + HZ;
+	/* A zero interval means periodic writeback is off: do not re-arm. */
+	if (dirty_writeback_centisecs)
+		mod_timer(&wb_timer, next_jif);
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
+ *
+ * proc_dointvec() performs the actual read or write of the value, and its
+ * result is returned to the caller so that a failed access is reported
+ * instead of being turned into success.
+ *
+ * Only a successful write touches wb_timer: it is re-armed for the new
+ * interval, or cancelled when the interval is zero.  Reads leave the timer
+ * alone, so polling the sysctl cannot keep pushing periodic writeback into
+ * the future.
+ */
+int dirty_writeback_centisecs_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, file, buffer, length, ppos);
+	if (ret || !write)
+		return ret;
+
+	if (dirty_writeback_centisecs) {
+		mod_timer(&wb_timer,
+			jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	} else {
+		del_timer(&wb_timer);
+	}
+	return 0;
+}
+
+/*
+ * wb_timer callback: hand wb_kupdate() to pdflush.  If pdflush_operation()
+ * reports failure (negative return), re-arm the timer to try again later.
+ */
+static void wb_timer_fn(unsigned long unused)
+{
+	if (pdflush_operation(wb_kupdate, 0) < 0)
+		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+}
+
+/* pdflush work function for laptop mode: run a full sys_sync(). */
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+/*
+ * laptop_mode_wb_timer callback: queue laptop_flush() on pdflush.  The
+ * result of pdflush_operation() is not checked here.
+ */
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
+
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ *
+ * The delay is laptop_mode * HZ jiffies, i.e. laptop_mode is used as a
+ * number of seconds.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback
+ * by deleting laptop_mode_wb_timer.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
+}
+
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive)
+ * get_writeback_state too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than about 3% (1/32) over the dirty
+ * memory thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high.  Because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time.  So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ */
+
+static void set_ratelimit(void)
+{
+	/* 1/32 of total_pages, shared out between the online CPUs. */
+	ratelimit_pages = total_pages / (num_online_cpus() * 32);
+	/* Lower bound: 16 pages. */
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+	/* Upper bound: four megabytes' worth of page-cache pages. */
+	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
+		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+/*
+ * CPU notifier callback (registered in page_writeback_init()): recompute
+ * ratelimit_pages on every notification.  The arguments are ignored and 0
+ * is always returned.
+ */
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+{
+	set_ratelimit();
+	return 0;
+}
+
+/* Notifier block that routes CPU notifications to ratelimit_handler(). */
+static struct notifier_block ratelimit_nb = {
+	.notifier_call	= ratelimit_handler,
+	.next		= NULL,
+};
+
+/*
+ * If the machine has a large highmem:lowmem ratio then scale back the default
+ * dirty memory thresholds: allowing too much dirty highmem pins an excessive
+ * number of buffer_heads.
+ *
+ * Also arms the periodic writeback timer, computes the initial
+ * ratelimit_pages and registers the CPU notifier that keeps it up to date.
+ */
+void __init page_writeback_init(void)
+{
+	long buffer_pages = nr_free_buffer_pages();
+	long correction;
+
+	total_pages = nr_free_pagecache_pages();
+
+	/* Four times the buffer pages, as a percentage of total_pages. */
+	correction = (100 * 4 * buffer_pages) / total_pages;
+
+	/* Below 100%: scale both ratios down by that percentage. */
+	if (correction < 100) {
+		dirty_background_ratio *= correction;
+		dirty_background_ratio /= 100;
+		vm_dirty_ratio *= correction;
+		vm_dirty_ratio /= 100;
+
+		/* Never let the scaling push a ratio down to zero. */
+		if (dirty_background_ratio <= 0)
+			dirty_background_ratio = 1;
+		if (vm_dirty_ratio <= 0)
+			vm_dirty_ratio = 1;
+	}
+	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	set_ratelimit();
+	register_cpu_notifier(&ratelimit_nb);
+}
+
+/*
+ * Write back pages of `mapping' as directed by `wbc'.  Returns 0 without
+ * doing anything when wbc->nr_to_write is not positive; otherwise calls the
+ * mapping's ->writepages method if it has one, else generic_writepages().
+ */
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	if (wbc->nr_to_write <= 0)
+		return 0;
+
+	if (!mapping->a_ops->writepages)
+		return generic_writepages(mapping, wbc);
+
+	return mapping->a_ops->writepages(mapping, wbc);
+}
+
+/**
+ * write_one_page - write out a single page and optionally wait on I/O
+ *
+ * @page: the page to write
+ * @wait: if true, wait on writeout
+ *
+ * The page must be locked by the caller and will be unlocked upon return:
+ * a page that turns out not to be dirty is unlocked here, otherwise the
+ * page is handed to the mapping's ->writepage method.
+ *
+ * Returns 0 if the page was not dirty, otherwise the result of ->writepage.
+ * When @wait is set and ->writepage succeeded, -EIO is returned if the page
+ * has its error flag set after writeback completes.
+ */
+int write_one_page(struct page *page, int wait)
+{
+	struct address_space *mapping = page->mapping;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 1,
+	};
+	int err;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Let any writeback already in flight finish first. */
+	if (wait)
+		wait_on_page_writeback(page);
+
+	if (!clear_page_dirty_for_io(page)) {
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Hold a reference across ->writepage and the wait that follows. */
+	page_cache_get(page);
+	err = mapping->a_ops->writepage(page, &wbc);
+	if (!err && wait) {
+		wait_on_page_writeback(page);
+		if (PageError(page))
+			err = -EIO;
+	}
+	page_cache_release(page);
+	return err;
+}
+EXPORT_SYMBOL(write_one_page);
+
+/*
+ * For address_spaces which do not use buffers.  Just tag the page as dirty in
+ * its radix tree.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers.  This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * Most callers have locked the page, which pins the address_space in memory.
+ * But zap_pte_range() does not lock the page, however in that case the
+ * mapping is pinned by the vma's ->vm_file reference.
+ *
+ * We take care to handle the case where the page was truncated from the
+ * mapping by re-checking page_mapping() inside tree_lock.
+ *
+ * `ret' is never assigned after its initialisation, so this function always
+ * returns 0.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+	int ret = 0;
+
+	/* Only the caller that flips the flag from clean to dirty does the work. */
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+		struct address_space *mapping2;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+			/* Look the mapping up again now that tree_lock is held. */
+			mapping2 = page_mapping(page);
+			if (mapping2) { /* Race with truncate? */
+				BUG_ON(mapping2 != mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_page_state(nr_dirty);
+				radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			/* The inode is marked dirty after tree_lock is dropped. */
+			if (mapping->host) {
+				/* !PageAnon && !swapper_space */
+				__mark_inode_dirty(mapping->host,
+							I_DIRTY_PAGES);
+			}
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+/*
+ * When a writepage implementation decides that it doesn't want to write this
+ * page for some reason, it should redirty the locked page via
+ * redirty_page_for_writepage() and it should then unlock the page and return 0
+ *
+ * Counts the page in wbc->pages_skipped and returns the result of
+ * __set_page_dirty_nobuffers().
+ */
+int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+{
+	wbc->pages_skipped++;
+	return __set_page_dirty_nobuffers(page);
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+/*
+ * Mark a page dirty.
+ *
+ * A page with a mapping is dirtied through the mapping's ->set_page_dirty
+ * a_op.  If the mapping doesn't provide one, just fall through and assume
+ * that it wants buffer_heads, i.e. use __set_page_dirty_buffers().
+ *
+ * A page without a mapping only gets its dirty flag set (if it is not set
+ * already), and 0 is returned.
+ */
+int fastcall set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int (*spd)(struct page *);
+
+	if (unlikely(!mapping)) {
+		if (!PageDirty(page))
+			SetPageDirty(page);
+		return 0;
+	}
+
+	spd = mapping->a_ops->set_page_dirty;
+	if (!spd)
+		return __set_page_dirty_buffers(page);
+	return (*spd)(page);
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
+ * set_page_dirty() is racy if the caller has no reference against
+ * page->mapping->host, and if the page is unlocked.  This is because another
+ * CPU could truncate the page off the mapping and then free the mapping.
+ *
+ * Usually, the page _is_ locked, or the caller is a user-space process which
+ * holds a reference on the inode by having an open file.
+ *
+ * In other cases, the page should be locked before running set_page_dirty().
+ *
+ * This helper does exactly that: it takes the page lock around the
+ * set_page_dirty() call and returns that call's result.
+ */
+int set_page_dirty_lock(struct page *page)
+{
+	int ret;
+
+	lock_page(page);
+	ret = set_page_dirty(page);
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * For a page with a mapping, the flag and the radix-tree dirty tag are
+ * cleared together under tree_lock; the nr_dirty counter is adjusted after
+ * the lock has been dropped.
+ */
+int test_clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	unsigned long flags;
+	int was_dirty;
+
+	if (!mapping)
+		return TestClearPageDirty(page);
+
+	write_lock_irqsave(&mapping->tree_lock, flags);
+	was_dirty = TestClearPageDirty(page);
+	if (was_dirty)
+		radix_tree_tag_clear(&mapping->page_tree,
+					page_index(page),
+					PAGECACHE_TAG_DIRTY);
+	write_unlock_irqrestore(&mapping->tree_lock, flags);
+
+	if (!was_dirty)
+		return 0;
+
+	if (mapping_cap_account_dirty(mapping))
+		dec_page_state(nr_dirty);
+	return 1;
+}
+EXPORT_SYMBOL(test_clear_page_dirty);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	/* No mapping: nothing to account, just test-and-clear the flag. */
+	if (!mapping)
+		return TestClearPageDirty(page);
+
+	if (!TestClearPageDirty(page))
+		return 0;
+
+	if (mapping_cap_account_dirty(mapping))
+		dec_page_state(nr_dirty);
+	return 1;
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+/*
+ * Clear a page's writeback flag and return its previous state.  For a page
+ * with a mapping this is done under tree_lock, and the radix-tree writeback
+ * tag is cleared as well when the flag had been set.
+ */
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	unsigned long flags;
+	int was_writeback;
+
+	if (!mapping)
+		return TestClearPageWriteback(page);
+
+	write_lock_irqsave(&mapping->tree_lock, flags);
+	was_writeback = TestClearPageWriteback(page);
+	if (was_writeback)
+		radix_tree_tag_clear(&mapping->page_tree,
+					page_index(page),
+					PAGECACHE_TAG_WRITEBACK);
+	write_unlock_irqrestore(&mapping->tree_lock, flags);
+	return was_writeback;
+}
+
+/*
+ * Set a page's writeback flag and return its previous state.
+ *
+ * For a page with a mapping this is done under tree_lock: the radix-tree
+ * writeback tag is set when the flag was newly set, and the radix-tree dirty
+ * tag is cleared if the page's dirty flag is no longer set.
+ */
+int test_set_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	unsigned long flags;
+	int was_writeback;
+
+	if (!mapping)
+		return TestSetPageWriteback(page);
+
+	write_lock_irqsave(&mapping->tree_lock, flags);
+	was_writeback = TestSetPageWriteback(page);
+	if (!was_writeback)
+		radix_tree_tag_set(&mapping->page_tree,
+					page_index(page),
+					PAGECACHE_TAG_WRITEBACK);
+	if (!PageDirty(page))
+		radix_tree_tag_clear(&mapping->page_tree,
+					page_index(page),
+					PAGECACHE_TAG_DIRTY);
+	write_unlock_irqrestore(&mapping->tree_lock, flags);
+	return was_writeback;
+}
+EXPORT_SYMBOL(test_set_page_writeback);
+
+/*
+ * Return true if any of the pages in the mapping are marked with the
+ * passed tag.  The radix tree is queried with tree_lock held for reading.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	unsigned long irq_flags;
+	int tagged;
+
+	read_lock_irqsave(&mapping->tree_lock, irq_flags);
+	tagged = radix_tree_tagged(&mapping->page_tree, tag);
+	read_unlock_irqrestore(&mapping->tree_lock, irq_flags);
+
+	return tagged;
+}
+EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
new file mode 100644
index 000000000000..c73dbbc1cd8f
--- /dev/null
+++ b/mm/page_alloc.c
@@ -0,0 +1,2220 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list, the system allocates free pages here.
+ *  Note that kmalloc() lives in slab.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/nodemask.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+/*
+ * MCD - HACK: Find somewhere to initialize this EARLY, or make this
+ * initializer cleaner
+ *
+ * The static initializer sets only bit 0 of the first word of
+ * node_online_map.
+ */
+nodemask_t node_online_map = { { [0] = 1UL } };
+nodemask_t node_possible_map = NODE_MASK_ALL;
+struct pglist_data *pgdat_list;
+unsigned long totalram_pages;
+unsigned long totalhigh_pages;
+long nr_swap_pages;
+
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ *	1G machine -> (16M dma, 784M normal, 224M high)
+ *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+
+EXPORT_SYMBOL(totalram_pages);
+EXPORT_SYMBOL(nr_swap_pages);
+
+/*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+ */
+struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+EXPORT_SYMBOL(zone_table);
+
+/* Printable zone names, in zone index order. */
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+int min_free_kbytes = 1024;
+
+/* __initdata: these two counters are only available during init. */
+unsigned long __initdata nr_kernel_pages;
+unsigned long __initdata nr_all_pages;
+
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ *
+ * Returns 1 if the page's pfn is outside the span of `zone', if (with
+ * CONFIG_HOLES_IN_ZONE) the pfn is not valid, or if page_zone() disagrees
+ * with `zone'.  Returns 0 otherwise.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+
+	if (pfn >= zone->zone_start_pfn + zone->spanned_pages ||
+	    pfn < zone->zone_start_pfn)
+		return 1;
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(pfn))
+		return 1;
+#endif
+	return zone != page_zone(page);
+}
+
+/*
+ * Report a page found in an unexpected state and try to patch it up.
+ *
+ * Logs the reporting function, the current process name, the page's flags,
+ * mapping, mapcount and count, plus a stack dump.  Then clears the listed
+ * page flags, zeroes the page count, resets the mapcount, clears ->mapping
+ * and sets TAINT_BAD_PAGE in the taint mask.
+ */
+static void bad_page(const char *function, struct page *page)
+{
+	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+		function, current->comm, page);
+	/* The flags field width is two hex digits per byte of page_flags_t. */
+	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
+		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+		page->mapping, page_mapcount(page), page_count(page));
+	printk(KERN_EMERG "Backtrace:\n");
+	dump_stack();
+	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
+	page->flags &= ~(1 << PG_private	|
+			1 << PG_locked	|
+			1 << PG_lru	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_swapcache |
+			1 << PG_writeback);
+	set_page_count(page, 0);
+	reset_page_mapcount(page);
+	page->mapping = NULL;
+	tainted |= TAINT_BAD_PAGE;
+}
+
+#ifndef CONFIG_HUGETLB_PAGE
+#define prep_compound_page(page, order) do { } while (0)
+#define destroy_compound_page(page, order) do { } while (0)
+#else
+/*
+ * Higher-order pages are called "compound pages".  They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set.  All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
+ *
+ * The order of the allocation is stored in the first tail page's ->index
+ * This is only for debug at present.  This usage means that zero-order pages
+ * may not be compound.
+ */
+/*
+ * Turn the 2^order pages starting at `page' into a compound page: the first
+ * tail page gets a NULL ->mapping and records the order in ->index, and
+ * every page (head included) is flagged PG_compound with ->private pointing
+ * at the head page.
+ */
+static void prep_compound_page(struct page *page, unsigned long order)
+{
+	struct page *p;
+	struct page *end = page + (1 << order);
+
+	page[1].mapping = NULL;
+	page[1].index = order;
+
+	for (p = page; p < end; p++) {
+		SetPageCompound(p);
+		p->private = (unsigned long)page;
+	}
+}
+
+/*
+ * Undo prep_compound_page().  Does nothing if the head page is not flagged
+ * compound.  Otherwise the order stored in the first tail page and each
+ * page's compound flag and ->private back-pointer are checked (mismatches
+ * are reported against the head page via bad_page()), and the compound flag
+ * is cleared on every page.
+ */
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+	struct page *p;
+	struct page *end = page + (1 << order);
+
+	if (!PageCompound(page))
+		return;
+
+	if (page[1].index != order)
+		bad_page(__FUNCTION__, page);
+
+	for (p = page; p < end; p++) {
+		if (!PageCompound(p))
+			bad_page(__FUNCTION__, page);
+		if (p->private != (unsigned long)page)
+			bad_page(__FUNCTION__, page);
+		ClearPageCompound(p);
+	}
+}
+#endif		/* CONFIG_HUGETLB_PAGE */
+
+/*
+ * function for dealing with page's order in buddy system.
+ * zone->lock is already acquired when we use these.
+ * So, we don't need atomic page->flags operations here.
+ */
+/* Read back the order that set_page_order() stored in page->private. */
+static inline unsigned long page_order(struct page *page) {
+	return page->private;
+}
+
+/* Record `order' in page->private and set PG_private (non-atomically). */
+static inline void set_page_order(struct page *page, int order) {
+	page->private = order;
+	__SetPagePrivate(page);
+}
+
+/* Clear PG_private (non-atomically) and zero the order in page->private. */
+static inline void rmv_page_order(struct page *page)
+{
+	__ClearPagePrivate(page);
+	page->private = 0;
+}
+
+/*
+ * Locate the struct page of the buddy that pairs with `page' at the given
+ * order.  `page_idx' is the index of `page'; the buddy's struct page is
+ * found by offsetting `page' by the difference between the two indices.
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ *     B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (B1) is #8 its order
+ * 1 buddy is #10:
+ *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ *     P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline struct page *
+__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
+{
+	/* Shift in unsigned long so the bit has the same type as page_idx. */
+	unsigned long buddy_idx = page_idx ^ (1UL << order);
+
+	return page + (buddy_idx - page_idx);
+}
+
+/*
+ * Return the index of the order+1 block that contains `page_idx': the index
+ * with bit `order' cleared.  The mask is built in unsigned long so that it
+ * matches the width of page_idx for every valid shift count.
+ */
+static inline unsigned long
+__find_combined_index(unsigned long page_idx, unsigned int order)
+{
+	return (page_idx & ~(1UL << order));
+}
+
+/*
+ * This function checks whether a page is free && is the buddy.
+ * We can coalesce a page and its buddy if
+ * (a) the buddy is free &&
+ * (b) the buddy is on the buddy system &&
+ * (c) a page and its buddy have the same order.
+ * For recording a page's order, we use page->private and PG_private.
+ *
+ * Returns 1 when `page' has PG_private set, has the given order recorded,
+ * is not reserved and has a zero count; 0 otherwise.
+ */
+static inline int page_is_buddy(struct page *page, int order)
+{
+	return PagePrivate(page) &&
+		page_order(page) == order &&
+		!PageReserved(page) &&
+		page_count(page) == 0;
+}
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep a list of pages, which are heads of continuous
+ * free pages of length of (1 << order) and marked with PG_private.  A page's
+ * order is recorded in its page->private field.
+ * So when we are allocating or freeing one, we can derive the state of the
+ * other.  That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- wli
+ */
+
+static inline void __free_pages_bulk (struct page *page,
+		struct zone *zone, unsigned int order)
+{
+	unsigned long page_idx;
+	int order_size = 1 << order;
+
+	if (unlikely(order))
+		destroy_compound_page(page, order);
+
+	/* Index of the page within its (1 << MAX_ORDER)-page aligned region. */
+	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+
+	/* The block must be aligned to its own size and lie in this zone. */
+	BUG_ON(page_idx & (order_size - 1));
+	BUG_ON(bad_range(zone, page));
+
+	zone->free_pages += order_size;
+	/* Merge with the buddy, one order at a time, for as long as we can. */
+	while (order < MAX_ORDER-1) {
+		unsigned long combined_idx;
+		struct free_area *area;
+		struct page *buddy;
+
+		combined_idx = __find_combined_index(page_idx, order);
+		buddy = __page_find_buddy(page, page_idx, order);
+
+		if (bad_range(zone, buddy))
+			break;
+		if (!page_is_buddy(buddy, order))
+			break;		/* Move the buddy up one level. */
+		/* Take the buddy off its free list; it becomes part of us. */
+		list_del(&buddy->lru);
+		area = zone->free_area + order;
+		area->nr_free--;
+		rmv_page_order(buddy);
+		/* Continue from the head of the combined, larger block. */
+		page = page + (combined_idx - page_idx);
+		page_idx = combined_idx;
+		order++;
+	}
+	/* Put the (possibly merged) block on the free list of its final order. */
+	set_page_order(page, order);
+	list_add(&page->lru, &zone->free_area[order].free_list);
+	zone->free_area[order].nr_free++;
+}
+
+/*
+ * Sanity-check a page that is about to be freed.  A page that is still
+ * mapped, still has a mapping, has a non-zero count, or carries any of the
+ * flags listed below is reported through bad_page(), with `function' naming
+ * the caller.  A leftover dirty flag is simply cleared.
+ */
+static inline void free_pages_check(const char *function, struct page *page)
+{
+	int bad;
+
+	bad = page_mapcount(page) ||
+		page->mapping != NULL ||
+		page_count(page) != 0;
+	if (!bad)
+		bad = (page->flags & (
+			1 << PG_lru	|
+			1 << PG_private |
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_reclaim	|
+			1 << PG_slab	|
+			1 << PG_swapcache |
+			1 << PG_writeback )) != 0;
+
+	if (bad)
+		bad_page(function, page);
+	if (PageDirty(page))
+		ClearPageDirty(page);
+}
+
+/*
+ * Frees a list of pages.
+ * Assumes all pages on list are in same zone, and of same order.
+ *
+ * Pages are taken from the tail of `list' and handed to __free_pages_bulk()
+ * until the list is empty or `count' pages have been freed.  The loop is
+ * driven by `count--', so a count of 0 frees nothing.  Returns the number
+ * of pages actually freed.
+ *
+ * If the zone was previously in an "all pages pinned" state then this
+ * freeing clears that state.
+ *
+ * It also clears the zone's pages_scanned counter, to hold off the "all
+ * pages are pinned" detection logic.
+ */
+static int
+free_pages_bulk(struct zone *zone, int count,
+		struct list_head *list, unsigned int order)
+{
+	unsigned long flags;
+	int nr_freed = 0;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
+	while (!list_empty(list) && count--) {
+		struct page *victim = list_entry(list->prev, struct page, lru);
+
+		/* have to delete it as __free_pages_bulk list manipulates */
+		list_del(&victim->lru);
+		__free_pages_bulk(victim, zone, order);
+		nr_freed++;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return nr_freed;
+}
+
+/*
+ * Free the 2^order block starting at `page' straight to the buddy
+ * allocator: account it in pgfree, sanity-check every page of the block,
+ * then pass the head page to free_pages_bulk() on a private one-entry list.
+ *
+ * Without an MMU, the extra reference held on each tail page is dropped
+ * first.
+ */
+void __free_pages_ok(struct page *page, unsigned int order)
+{
+	LIST_HEAD(list);
+	int nr_pages = 1 << order;
+	int i;
+
+	arch_free_page(page, order);
+
+	mod_page_state(pgfree, nr_pages);
+
+#ifndef CONFIG_MMU
+	for (i = 1; i < nr_pages; i++)
+		__put_page(page + i);
+#endif
+
+	for (i = 0; i < nr_pages; i++)
+		free_pages_check(__FUNCTION__, page + i);
+	list_add(&page->lru, &list);
+	kernel_map_pages(page, nr_pages, 0);
+	free_pages_bulk(page_zone(page), 1, &list, order);
+}
+
+
+/*
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing. Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
+ *
+ * -- wli
+ *
+ * Splits a block of order `high' down to order `low'.  At each step the
+ * upper half is put on the free list one order down (`area' is stepped
+ * back in lock-step with `high') and the lower half is kept.  Returns
+ * `page', which is now the head of a block of order `low'.
+ */
+static inline struct page *
+expand(struct zone *zone, struct page *page,
+ 	int low, int high, struct free_area *area)
+{
+	unsigned long size = 1 << high;
+
+	while (high > low) {
+		/* Step down one order; `size' is now half the current block. */
+		area--;
+		high--;
+		size >>= 1;
+		BUG_ON(bad_range(zone, &page[size]));
+		/* Give the upper half back to the free list at this order. */
+		list_add(&page[size].lru, &area->free_list);
+		area->nr_free++;
+		set_page_order(&page[size], high);
+	}
+	return page;
+}
+
+/*
+ * Give a freshly allocated block its initial reference count.
+ *
+ * With an MMU only the first page has its count set to 1.  Without one,
+ * every page of the 2^order block is set to 1: we need to reference all the
+ * pages for this order, otherwise if anyone accesses one of the pages with
+ * (get/put) it will be freed.
+ * - eg: access_process_vm()
+ */
+void set_page_refs(struct page *page, int order)
+{
+#ifndef CONFIG_MMU
+	struct page *p;
+	struct page *end = page + (1 << order);
+
+	for (p = page; p < end; p++)
+		set_page_count(p, 1);
+#else
+	set_page_count(page, 1);
+#endif /* !CONFIG_MMU */
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ *
+ * A page that still has a mapping, a non-zero mapcount or any of the flags
+ * listed in the first mask is reported via bad_page().  The flags in the
+ * second mask are then cleared, ->private is zeroed, the reference count is
+ * initialised and the 2^order pages are passed to kernel_map_pages().
+ */
+static void prep_new_page(struct page *page, int order)
+{
+	if (page->mapping || page_mapcount(page) ||
+	    (page->flags & (
+			1 << PG_private	|
+			1 << PG_locked	|
+			1 << PG_lru	|
+			1 << PG_active	|
+			1 << PG_dirty	|
+			1 << PG_reclaim	|
+			1 << PG_swapcache |
+			1 << PG_writeback )))
+		bad_page(__FUNCTION__, page);
+
+	/* Drop state flags left over from the page's previous use. */
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+			1 << PG_referenced | 1 << PG_arch_1 |
+			1 << PG_checked | 1 << PG_mappedtodisk);
+	page->private = 0;
+	set_page_refs(page, order);
+	kernel_map_pages(page, 1 << order, 1);
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ *
+ * Searches the free lists from `order' upwards and takes the first block
+ * found; a block larger than requested is split by expand().  Returns the
+ * head page of the order-`order' block, or NULL if every list was empty.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
+{
+	unsigned int cur;
+
+	for (cur = order; cur < MAX_ORDER; cur++) {
+		struct free_area *area = &zone->free_area[cur];
+		struct page *page;
+
+		if (!list_empty(&area->free_list)) {
+			page = list_entry(area->free_list.next,
+						struct page, lru);
+			list_del(&page->lru);
+			rmv_page_order(page);
+			area->nr_free--;
+			zone->free_pages -= 1UL << order;
+			return expand(zone, page, order, cur, area);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Stops early if the allocator runs dry.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+			unsigned long count, struct list_head *list)
+{
+	unsigned long flags;
+	int allocated = 0;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	while (allocated < count) {
+		struct page *page = __rmqueue(zone, order);
+
+		if (!page)
+			break;
+		list_add_tail(&page->lru, list);
+		allocated++;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return allocated;
+}
+
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+/*
+ * For every zone, hand all pages on each of `cpu's per-cpu lists back to
+ * the buddy allocator, lowering each list's count by the number freed.
+ */
+static void __drain_pages(unsigned int cpu)
+{
+	struct zone *zone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = &zone->pageset[cpu];
+
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp = &pset->pcp[i];
+
+			pcp->count -= free_pages_bulk(zone, pcp->count,
+						&pcp->list, 0);
+		}
+	}
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_PM
+
+/*
+ * Under zone->lock, clear the NosaveFree flag on every page the zone spans,
+ * then set it on every page of every block currently on the zone's buddy
+ * free lists.  A zone that spans no pages is left alone.
+ */
+void mark_free_pages(struct zone *zone)
+{
+	unsigned long pfn, end_pfn, flags;
+	int order;
+	struct list_head *curr;
+
+	if (!zone->spanned_pages)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	for (pfn = zone->zone_start_pfn; pfn < end_pfn; pfn++)
+		ClearPageNosaveFree(pfn_to_page(pfn));
+
+	for (order = MAX_ORDER - 1; order >= 0; --order) {
+		list_for_each(curr, &zone->free_area[order].free_list) {
+			struct page *head = list_entry(curr, struct page, lru);
+			unsigned long start_pfn = page_to_pfn(head);
+			unsigned long i;
+
+			for (i = 0; i < (1 << order); i++)
+				SetPageNosaveFree(pfn_to_page(start_pfn + i));
+		}
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * Interrupts are disabled around the call, so the CPU number passed to
+ * __drain_pages() is read with local interrupts off.
+ */
+void drain_local_pages(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);	
+	__drain_pages(smp_processor_id());
+	local_irq_restore(flags);	
+}
+#endif /* CONFIG_PM */
+
+/*
+ * Update the per-cpu NUMA allocation counters for an allocation that was
+ * satisfied from zone `z' (only with CONFIG_NUMA; otherwise this is empty).
+ *
+ * If `z' is on the same node as the first zone of `zonelist', that is a
+ * numa_hit for `z'.  Otherwise it is a numa_miss for `z' and a numa_foreign
+ * for the zonelist's first zone.  Independently, `z' counts a local_node or
+ * other_node allocation depending on whether it is on the current node.
+ */
+static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+	unsigned long flags;
+	int cpu;
+	struct zone *preferred = zonelist->zones[0];
+	struct per_cpu_pageset *p;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	p = &z->pageset[cpu];
+	if (z->zone_pgdat == preferred->zone_pgdat) {
+		p->numa_hit++;
+	} else {
+		p->numa_miss++;
+		preferred->pageset[cpu].numa_foreign++;
+	}
+	if (z->zone_pgdat == NODE_DATA(numa_node_id()))
+		p->local_node++;
+	else
+		p->other_node++;
+	local_irq_restore(flags);
+#endif
+}
+
+/*
+ * Free a 0-order page onto the current CPU's per-cpu list for its zone.
+ * @cold selects which of the two lists (0 or 1) receives the page.  When
+ * the list has reached its high mark, one batch is first pushed back with
+ * free_pages_bulk().
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void fastcall free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *cache;
+	unsigned long irq_state;
+	int cpu;
+
+	arch_free_page(page, 0);
+	kernel_map_pages(page, 1, 0);
+	inc_page_state(pgfree);
+
+	if (PageAnon(page))
+		page->mapping = NULL;
+	free_pages_check(__FUNCTION__, page);
+
+	cpu = get_cpu();
+	cache = &zone->pageset[cpu].pcp[cold];
+
+	local_irq_save(irq_state);
+	if (cache->count >= cache->high)
+		cache->count -= free_pages_bulk(zone, cache->batch,
+						&cache->list, 0);
+	list_add(&page->lru, &cache->list);
+	cache->count++;
+	local_irq_restore(irq_state);
+
+	put_cpu();
+}
+
+/* Free a 0-order page onto the hot (index 0) per-cpu list. */
+void fastcall free_hot_page(struct page *page)
+{
+	const int cold = 0;
+
+	free_hot_cold_page(page, cold);
+}
+	
+/* Free a 0-order page onto the cold (index 1) per-cpu list. */
+void fastcall free_cold_page(struct page *page)
+{
+	const int cold = 1;
+
+	free_hot_cold_page(page, cold);
+}
+
+/*
+ * Clear all 2^order pages starting at @page with clear_highpage().
+ * A request that sets __GFP_HIGHMEM without __GFP_WAIT trips the BUG_ON.
+ */
+static inline void prep_zero_page(struct page *page, int order,
+				  unsigned int __nocast gfp_flags)
+{
+	const int nr_pages = 1 << order;
+	int idx;
+
+	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+	for (idx = 0; idx < nr_pages; idx++)
+		clear_highpage(&page[idx]);
+}
+
+/*
+ * Take one block of 2^order pages out of @zone, or return NULL.
+ *
+ * Order-0 requests are served from the current CPU's per-cpu list (the one
+ * indexed by __GFP_COLD), which is refilled through rmqueue_bulk() once it
+ * has dropped to its low mark.  Larger requests, and order-0 requests that
+ * found nothing on the per-cpu list, go to __rmqueue() under zone->lock.
+ *
+ * Really, prep_compound_page() should be called from __rmqueue_bulk().  We
+ * cheat by calling it here, in the order > 0 path, to save a branch or two.
+ */
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+{
+	struct page *page = NULL;
+	unsigned long flags;
+
+	if (order == 0) {
+		const int cold = (gfp_flags & __GFP_COLD) ? 1 : 0;
+		struct per_cpu_pages *cache;
+
+		cache = &zone->pageset[get_cpu()].pcp[cold];
+		local_irq_save(flags);
+		if (cache->count <= cache->low)
+			cache->count += rmqueue_bulk(zone, 0, cache->batch,
+						     &cache->list);
+		if (cache->count != 0) {
+			page = list_entry(cache->list.next, struct page, lru);
+			list_del(&page->lru);
+			cache->count--;
+		}
+		local_irq_restore(flags);
+		put_cpu();
+	}
+
+	if (!page) {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	if (!page)
+		return NULL;
+
+	BUG_ON(bad_range(zone, page));
+	mod_page_state_zone(zone, pgalloc, 1 << order);
+	prep_new_page(page, order);
+
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+	if (order != 0 && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
+
+	return page;
+}
+
+/*
+ * Return 1 if free pages are above 'mark', 0 otherwise.  The order of the
+ * allocation is taken into account: for each order below the requested
+ * one, the free blocks of that order are discounted and the required
+ * minimum is halved.  @gfp_high and @can_try_harder each lower the mark;
+ * lowmem_reserve[@classzone_idx] is added to it for the first check only.
+ */
+int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int can_try_harder, int gfp_high)
+{
+	long threshold = mark;
+	/* avail may go negative - that's OK */
+	long avail = z->free_pages - (1 << order) + 1;
+	int o = 0;
+
+	if (gfp_high)
+		threshold -= threshold / 2;
+	if (can_try_harder)
+		threshold -= threshold / 4;
+
+	if (avail <= threshold + z->lowmem_reserve[classzone_idx])
+		return 0;
+
+	while (o < order) {
+		/* At the next order, this order's pages become unavailable */
+		avail -= z->free_area[o].nr_free << o;
+
+		/* Require fewer higher order pages to be free */
+		threshold >>= 1;
+
+		if (avail <= threshold)
+			return 0;
+		o++;
+	}
+	return 1;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Tries to allocate a block of 2^order pages from the zones in @zonelist,
+ * escalating step by step:
+ *  1. any allowed zone that is above its pages_low watermark;
+ *  2. after waking kswapd on every zone, any zone above pages_min (the
+ *     mark is lowered for __GFP_HIGH and for "try harder" callers);
+ *  3. for PF_MEMALLOC / TIF_MEMDIE tasks outside interrupt context, any
+ *     allowed zone with no watermark check at all;
+ *  4. for callers that may wait, direct reclaim via try_to_free_pages(),
+ *     followed by another pass, out_of_memory(), or a retry loop.
+ *
+ * Returns the page on success.  On failure returns NULL, after printing a
+ * rate-limited warning and stack dump unless __GFP_NOWARN is set.
+ */
+struct page * fastcall
+__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	const int wait = gfp_mask & __GFP_WAIT;
+	struct zone **zones, *z;
+	struct page *page;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+	int i;
+	int classzone_idx;
+	int do_retry;
+	int can_try_harder;
+	int did_some_progress;
+
+	might_sleep_if(wait);
+
+	/*
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy
+	 */
+	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+
+	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+
+	if (unlikely(zones[0] == NULL)) {
+		/* Should this ever happen?? */
+		return NULL;
+	}
+
+	classzone_idx = zone_idx(zones[0]);
+
+ restart:
+	/* Go through the zonelist once, looking for a zone with enough free */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+
+		if (!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, 0, 0))
+			continue;
+
+		if (!cpuset_zone_allowed(z))
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
+	}
+
+	for (i = 0; (z = zones[i]) != NULL; i++)
+		wakeup_kswapd(z, order);
+
+	/*
+	 * Go through the zonelist again. Let __GFP_HIGH allocations and
+	 * allocations coming from realtime tasks go deeper into reserves
+	 *
+	 * This is the last chance, in general, before the goto nopage.
+	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		if (!zone_watermark_ok(z, order, z->pages_min,
+				       classzone_idx, can_try_harder,
+				       gfp_mask & __GFP_HIGH))
+			continue;
+
+		if (wait && !cpuset_zone_allowed(z))
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
+	}
+
+	/* This allocation should allow future memory freeing. */
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
+		/* go through the zonelist yet again, ignoring mins */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			if (!cpuset_zone_allowed(z))
+				continue;
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+		goto nopage;
+	}
+
+	/* Atomic allocations - we can't balance anything */
+	if (!wait)
+		goto nopage;
+
+rebalance:
+	cond_resched();
+
+	/* We now go into synchronous reclaim */
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (likely(did_some_progress)) {
+		/*
+		 * Reclaim made progress: go through the zonelist one more
+		 * time with the same pages_min check (lowered for __GFP_HIGH
+		 * and try-harder callers) that was used before reclaim.  If
+		 * this still fails we fall through to the retry logic below.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			if (!zone_watermark_ok(z, order, z->pages_min,
+					       classzone_idx, can_try_harder,
+					       gfp_mask & __GFP_HIGH))
+				continue;
+
+			if (!cpuset_zone_allowed(z))
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			if (!zone_watermark_ok(z, order, z->pages_high,
+					       classzone_idx, 0, 0))
+				continue;
+
+			if (!cpuset_zone_allowed(z))
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+
+		out_of_memory(gfp_mask);
+		goto restart;
+	}
+
+	/*
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that.  Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * <= 3, but that may not be true in other implementations.
+	 */
+	do_retry = 0;
+	if (!(gfp_mask & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_mask & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		blk_congestion_wait(WRITE, HZ/50);
+		goto rebalance;
+	}
+
+nopage:
+	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+		printk(KERN_WARNING "%s: page allocation failure."
+			" order:%d, mode:0x%x\n",
+			p->comm, order, gfp_mask);
+		dump_stack();
+	}
+	return NULL;
+got_pg:
+	/* z is the zone the page was taken from */
+	zone_statistics(zonelist, z);
+	return page;
+}
+
+EXPORT_SYMBOL(__alloc_pages);
+
+/*
+ * Common helper functions.
+ */
+/*
+ * Allocate 2^order pages and return the address that page_address() gives
+ * for the first one, or 0 if the allocation failed.
+ */
+fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
+{
+	struct page *pg = alloc_pages(gfp_mask, order);
+
+	return pg ? (unsigned long) page_address(pg) : 0;
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+/*
+ * Allocate a single page with __GFP_ZERO added to @gfp_mask and return
+ * its address, or 0 on failure.
+ */
+fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
+{
+	struct page *pg;
+
+	/*
+	 * get_zeroed_page() returns a 32-bit address, which cannot represent
+	 * a highmem page
+	 */
+	BUG_ON(gfp_mask & __GFP_HIGHMEM);
+
+	pg = alloc_pages(gfp_mask | __GFP_ZERO, 0);
+	if (!pg)
+		return 0;
+
+	return (unsigned long) page_address(pg);
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+/*
+ * Free every page held in @pvec as a 0-order page, last entry first,
+ * using the pagevec's own cold setting.
+ */
+void __pagevec_free(struct pagevec *pvec)
+{
+	int remaining = pagevec_count(pvec);
+
+	while (remaining > 0) {
+		remaining--;
+		free_hot_cold_page(pvec->pages[remaining], pvec->cold);
+	}
+}
+
+/*
+ * Drop one reference on @page and free the 2^order block if that was the
+ * last one.  Reserved pages are left alone: their count is not touched.
+ */
+fastcall void __free_pages(struct page *page, unsigned int order)
+{
+	if (PageReserved(page) || !put_page_testzero(page))
+		return;
+
+	if (order != 0)
+		__free_pages_ok(page, order);
+	else
+		free_hot_page(page);
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+/*
+ * Address-based front end to __free_pages().  An @addr of 0 is ignored;
+ * any other address must satisfy virt_addr_valid().
+ */
+fastcall void free_pages(unsigned long addr, unsigned int order)
+{
+	void *vaddr = (void *)addr;
+
+	if (addr == 0)
+		return;
+
+	BUG_ON(!virt_addr_valid(vaddr));
+	__free_pages(virt_to_page(vaddr), order);
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Total amount of free (allocatable) RAM: the sum of free_pages over all
+ * zones, in pages.
+ */
+unsigned int nr_free_pages(void)
+{
+	struct zone *z;
+	unsigned int total = 0;
+
+	for_each_zone(z) {
+		total += z->free_pages;
+	}
+
+	return total;
+}
+
+EXPORT_SYMBOL(nr_free_pages);
+
+#ifdef CONFIG_NUMA
+/* Sum of free_pages over all MAX_NR_ZONES zones of node @pgdat. */
+unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+{
+	unsigned int total = 0;
+	unsigned int idx = 0;
+
+	while (idx < MAX_NR_ZONES) {
+		total += pgdat->node_zones[idx].free_pages;
+		idx++;
+	}
+
+	return total;
+}
+#endif
+
+/*
+ * For every node, walk the zonelist at index @offset and add up, for each
+ * zone on it, the pages present beyond the zone's pages_high watermark.
+ */
+static unsigned int nr_free_zone_pages(int offset)
+{
+	unsigned int total = 0;
+	pg_data_t *pgdat;
+
+	for_each_pgdat(pgdat) {
+		struct zonelist *zl = &pgdat->node_zonelists[offset];
+		int idx;
+
+		for (idx = 0; zl->zones[idx] != NULL; idx++) {
+			unsigned long present = zl->zones[idx]->present_pages;
+			unsigned long reserve = zl->zones[idx]->pages_high;
+
+			if (present > reserve)
+				total += present - reserve;
+		}
+	}
+
+	return total;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL,
+ * computed over the zonelists selected by GFP_USER.
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+	const int zonelist_idx = GFP_USER & GFP_ZONEMASK;
+
+	return nr_free_zone_pages(zonelist_idx);
+}
+
+/*
+ * Amount of free RAM allocatable within all zones, computed over the
+ * zonelists selected by GFP_HIGHUSER.
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+	const int zonelist_idx = GFP_HIGHUSER & GFP_ZONEMASK;
+
+	return nr_free_zone_pages(zonelist_idx);
+}
+
+#ifdef CONFIG_HIGHMEM
+/* Free pages in ZONE_HIGHMEM, summed over all nodes. */
+unsigned int nr_free_highpages (void)
+{
+	unsigned int total = 0;
+	pg_data_t *node;
+
+	for_each_pgdat(node) {
+		struct zone *high = &node->node_zones[ZONE_HIGHMEM];
+
+		total += high->free_pages;
+	}
+
+	return total;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+/* Print a "Node <id> " prefix for the node that @zone belongs to. */
+static void show_node(struct zone *zone)
+{
+	pg_data_t *owner = zone->zone_pgdat;
+
+	printk("Node %d ", owner->node_id);
+}
+#else
+#define show_node(zone)	do { } while (0)
+#endif
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+static DEFINE_PER_CPU(struct page_state, page_states) = {0};
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+/*
+ * Zero *@ret, then add into it the first @nr unsigned long counters of
+ * every online CPU's page_states.  The next CPU's counters are prefetched
+ * while the current CPU's are being summed.
+ */
+void __get_page_state(struct page_state *ret, int nr)
+{
+	unsigned long *totals = (unsigned long *)ret;
+	int cpu, next;
+
+	memset(ret, 0, sizeof(*ret));
+
+	for (cpu = first_cpu(cpu_online_map); cpu < NR_CPUS; cpu = next) {
+		unsigned long *src = (unsigned long *)&per_cpu(page_states, cpu);
+		unsigned long idx;
+
+		next = next_cpu(cpu, cpu_online_map);
+		if (next < NR_CPUS)
+			prefetch(&per_cpu(page_states, next));
+
+		for (idx = 0; idx < nr; idx++)
+			totals[idx] += src[idx];
+	}
+}
+
+/*
+ * Collect the page_state counters up to and including the field named by
+ * GET_PAGE_STATE_LAST; later fields of *@ret are left zeroed.
+ */
+void get_page_state(struct page_state *ret)
+{
+	int last_idx = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+
+	/* Turn the byte offset into an index in unsigned long units. */
+	last_idx /= sizeof(unsigned long);
+
+	__get_page_state(ret, last_idx + 1);
+}
+
+/* Collect every counter of struct page_state, summed over online CPUs. */
+void get_full_page_state(struct page_state *ret)
+{
+	const int nr_counters = sizeof(*ret) / sizeof(unsigned long);
+
+	__get_page_state(ret, nr_counters);
+}
+
+/*
+ * Sum, over all online CPUs, the unsigned long counter located @offset
+ * bytes into that CPU's page_states.
+ */
+unsigned long __read_page_state(unsigned offset)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		char *base = (char *)&per_cpu(page_states, cpu);
+
+		total += *(unsigned long *)(base + offset);
+	}
+
+	return total;
+}
+
+/*
+ * Add @delta to the unsigned long counter located @offset bytes into the
+ * current CPU's page_states, with interrupts disabled around the update.
+ */
+void __mod_page_state(unsigned offset, unsigned long delta)
+{
+	unsigned long irq_state;
+	unsigned long *counter;
+	char *base;
+
+	local_irq_save(irq_state);
+	base = (char *)&__get_cpu_var(page_states);
+	counter = (unsigned long *)(base + offset);
+	*counter += delta;
+	local_irq_restore(irq_state);
+}
+
+EXPORT_SYMBOL(__mod_page_state);
+
+/*
+ * Store in *@active, *@inactive and *@free the nr_active, nr_inactive and
+ * free_pages totals over all MAX_NR_ZONES zones of node @pgdat.
+ */
+void __get_zone_counts(unsigned long *active, unsigned long *inactive,
+			unsigned long *free, struct pglist_data *pgdat)
+{
+	struct zone *z = pgdat->node_zones;
+	int left = MAX_NR_ZONES;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+
+	for (; left > 0; left--, z++) {
+		*active += z->nr_active;
+		*inactive += z->nr_inactive;
+		*free += z->free_pages;
+	}
+}
+
+/*
+ * System-wide version of __get_zone_counts(): the three results are the
+ * per-node figures summed over every node.
+ */
+void get_zone_counts(unsigned long *active,
+		unsigned long *inactive, unsigned long *free)
+{
+	struct pglist_data *node;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+
+	for_each_pgdat(node) {
+		unsigned long node_active, node_inactive, node_free;
+
+		__get_zone_counts(&node_active, &node_inactive, &node_free,
+				  node);
+		*active += node_active;
+		*inactive += node_inactive;
+		*free += node_free;
+	}
+}
+
+/*
+ * Fill in the memory fields of @val for the whole system.  All figures
+ * are in units of PAGE_SIZE (mem_unit); sharedram is always reported as 0
+ * and the highmem fields are 0 without CONFIG_HIGHMEM.
+ */
+void si_meminfo(struct sysinfo *val)
+{
+	val->totalram = totalram_pages;
+	val->sharedram = 0;
+	val->freeram = nr_free_pages();
+	val->bufferram = nr_blockdev_pages();
+
+#ifndef CONFIG_HIGHMEM
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#else
+	val->totalhigh = totalhigh_pages;
+	val->freehigh = nr_free_highpages();
+#endif
+
+	val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+/*
+ * Per-node counterpart of si_meminfo(): fills totalram, freeram,
+ * totalhigh, freehigh and mem_unit of @val for node @nid.  No other
+ * field of @val is written.
+ */
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+	pg_data_t *node = NODE_DATA(nid);
+	struct zone *high = &node->node_zones[ZONE_HIGHMEM];
+
+	val->totalram = node->node_present_pages;
+	val->freeram = nr_free_pages_pgdat(node);
+	val->totalhigh = high->present_pages;
+	val->freehigh = high->free_pages;
+	val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff).
+ *
+ * Prints, via printk():
+ *  - for every zone, the low/high/batch settings of each possible CPU's
+ *    hot and cold per-cpu lists;
+ *  - the global free page totals and the active/inactive/page_state
+ *    counters;
+ *  - for every zone, its watermarks, LRU sizes, scan state and
+ *    lowmem_reserve[] values, in kB where labelled;
+ *  - for every zone, the number of free blocks at each order and their
+ *    total size (read under zone->lock);
+ * and finally calls show_swap_cache_info().
+ */
+void show_free_areas(void)
+{
+	struct page_state ps;
+	int cpu, temperature;
+	unsigned long active;
+	unsigned long inactive;
+	unsigned long free;
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		show_node(zone);
+		printk("%s per-cpu:", zone->name);
+
+		if (!zone->present_pages) {
+			printk(" empty\n");
+			continue;
+		} else
+			printk("\n");
+
+		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+			struct per_cpu_pageset *pageset;
+
+			if (!cpu_possible(cpu))
+				continue;
+
+			pageset = zone->pageset + cpu;
+
+			for (temperature = 0; temperature < 2; temperature++)
+				printk("cpu %d %s: low %d, high %d, batch %d\n",
+					cpu,
+					temperature ? "cold" : "hot",
+					pageset->pcp[temperature].low,
+					pageset->pcp[temperature].high,
+					pageset->pcp[temperature].batch);
+		}
+	}
+
+	get_page_state(&ps);
+	get_zone_counts(&active, &inactive, &free);
+
+	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+		K(nr_free_pages()),
+		K(nr_free_highpages()));
+
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+		active,
+		inactive,
+		ps.nr_dirty,
+		ps.nr_writeback,
+		ps.nr_unstable,
+		nr_free_pages(),
+		ps.nr_slab,
+		ps.nr_mapped,
+		ps.nr_page_table_pages);
+
+	for_each_zone(zone) {
+		int i;
+
+		show_node(zone);
+		printk("%s"
+			" free:%lukB"
+			" min:%lukB"
+			" low:%lukB"
+			" high:%lukB"
+			" active:%lukB"
+			" inactive:%lukB"
+			" present:%lukB"
+			" pages_scanned:%lu"
+			" all_unreclaimable? %s"
+			"\n",
+			zone->name,
+			K(zone->free_pages),
+			K(zone->pages_min),
+			K(zone->pages_low),
+			K(zone->pages_high),
+			K(zone->nr_active),
+			K(zone->nr_inactive),
+			K(zone->present_pages),
+			zone->pages_scanned,
+			(zone->all_unreclaimable ? "yes" : "no")
+			);
+		printk("lowmem_reserve[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->lowmem_reserve[i]);
+		printk("\n");
+	}
+
+	for_each_zone(zone) {
+ 		unsigned long nr, flags, order, total = 0;
+
+		show_node(zone);
+		printk("%s: ", zone->name);
+		if (!zone->present_pages) {
+			printk("empty\n");
+			continue;
+		}
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {
+			nr = zone->free_area[order].nr_free;
+			total += nr << order;	/* blocks of this order, counted in pages */
+			printk("%lu*%lukB ", nr, K(1UL) << order);
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+		printk("= %lukB\n", K(total));
+	}
+
+	show_swap_cache_info();
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Appends the populated zones of node @pgdat to @zonelist, starting at
+ * slot @j.  @k names the highest zone type to add; the switch falls
+ * through on purpose, so every lower zone type follows it (HIGHMEM, then
+ * NORMAL, then DMA).  Zones with no present pages are skipped.  Any other
+ * value of @k is a BUG().  Returns the index of the next unused slot; no
+ * NULL terminator is written here.
+ */
+static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+{
+	switch (k) {
+		struct zone *zone;
+	default:
+		BUG();
+	case ZONE_HIGHMEM:
+		zone = pgdat->node_zones + ZONE_HIGHMEM;
+		if (zone->present_pages) {
+#ifndef CONFIG_HIGHMEM
+			BUG();	/* populated highmem zone in a kernel built without CONFIG_HIGHMEM */
+#endif
+			zonelist->zones[j++] = zone;
+		}
+	case ZONE_NORMAL:	/* also reached by fall-through from ZONE_HIGHMEM */
+		zone = pgdat->node_zones + ZONE_NORMAL;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
+	case ZONE_DMA:		/* also reached by fall-through from ZONE_NORMAL */
+		zone = pgdat->node_zones + ZONE_DMA;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
+	}
+
+	return j;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (num_online_nodes())
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given
+ *    node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ *
+ * Candidates are taken straight from the set of online nodes, so node ids
+ * do not have to be contiguous.  (Deriving the candidate id arithmetically
+ * as (node + i) % num_online_nodes() is only correct when the online ids
+ * are exactly 0..N-1.)
+ *
+ * The chosen node is marked in @used_node_mask before returning.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val, pass;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	/*
+	 * Two passes over the online nodes: ids above @node first, then the
+	 * remaining ones.  This is the same "start after the local node and
+	 * wrap around" order as before, which matters because the first of
+	 * several equally good candidates wins.
+	 */
+	for (pass = 0; pass < 2; pass++) {
+		for_each_online_node(n) {
+			cpumask_t tmp;
+
+			/* pass 0 takes n > node, pass 1 takes n <= node */
+			if ((n > node) != (pass == 0))
+				continue;
+
+			/* Don't want a node to appear more than once */
+			if (node_isset(n, *used_node_mask))
+				continue;
+
+			/* Use the distance array to find the distance */
+			val = node_distance(node, n);
+
+			/* Give preference to headless and unused nodes */
+			tmp = node_to_cpumask(n);
+			if (!cpus_empty(tmp))
+				val += PENALTY_FOR_NODE_WITH_CPUS;
+
+			/* Slight preference for less loaded node */
+			val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+			val += node_load[n];
+
+			if (val < min_val) {
+				min_val = val;
+				best_node = n;
+			}
+		}
+	}
+
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+/*
+ * NUMA version: build every zonelist of @pgdat by repeatedly asking
+ * find_next_best_node() for the next node and appending that node's
+ * eligible zones to each list.
+ */
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	const int local_node = pgdat->node_id;
+	int prev_node = local_node;
+	int load = num_online_nodes();
+	nodemask_t used_mask;
+	int type, node;
+
+	/* initialize zonelists */
+	for (type = 0; type < GFP_ZONETYPES; type++)
+		pgdat->node_zonelists[type].zones[0] = NULL;
+
+	/* NUMA-aware ordering of nodes */
+	nodes_clear(used_mask);
+	for (;;) {
+		node = find_next_best_node(local_node, &used_mask);
+		if (node < 0)
+			break;
+
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+		if (node_distance(local_node, node) !=
+				node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+
+		for (type = 0; type < GFP_ZONETYPES; type++) {
+			struct zonelist *zl = &pgdat->node_zonelists[type];
+			int highest;
+			int end = 0;
+
+			/* Find the current end of this list. */
+			while (zl->zones[end] != NULL)
+				end++;
+
+			/* __GFP_DMA takes precedence over __GFP_HIGHMEM. */
+			if (type & __GFP_DMA)
+				highest = ZONE_DMA;
+			else if (type & __GFP_HIGHMEM)
+				highest = ZONE_HIGHMEM;
+			else
+				highest = ZONE_NORMAL;
+
+			end = build_zonelists_node(NODE_DATA(node), zl, end,
+						   highest);
+			zl->zones[end] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
+/*
+ * Non-NUMA version: each zonelist of @pgdat gets the node's own eligible
+ * zones first, followed by those of every other online node.
+ *
+ * We don't want to pressure a particular node, so the other nodes are
+ * visited in ascending id order starting right after the local node and
+ * wrapping around at MAX_NUMNODES.
+ */
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	const int local_node = pgdat->node_id;
+	int type;
+
+	for (type = 0; type < GFP_ZONETYPES; type++) {
+		struct zonelist *zl = &pgdat->node_zonelists[type];
+		int highest, offset, end;
+
+		/* __GFP_DMA takes precedence over __GFP_HIGHMEM. */
+		if (type & __GFP_DMA)
+			highest = ZONE_DMA;
+		else if (type & __GFP_HIGHMEM)
+			highest = ZONE_HIGHMEM;
+		else
+			highest = ZONE_NORMAL;
+
+		end = build_zonelists_node(pgdat, zl, 0, highest);
+
+		for (offset = 1; offset < MAX_NUMNODES; offset++) {
+			int node = (local_node + offset) % MAX_NUMNODES;
+
+			if (node_online(node))
+				end = build_zonelists_node(NODE_DATA(node), zl,
+							   end, highest);
+		}
+
+		zl->zones[end] = NULL;
+	}
+}
+
+#endif	/* CONFIG_NUMA */
+
+/*
+ * Build the zonelists of every online node, report how many nodes were
+ * processed, then call cpuset_init_current_mems_allowed().
+ */
+void __init build_all_zonelists(void)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		build_zonelists(NODE_DATA(nid));
+	}
+
+	printk("Built %i zonelists\n", num_online_nodes());
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+/*
+ * Number of wait table entries for a zone of @pages pages: the smallest
+ * power of two that is at least pages / PAGES_PER_WAITQUEUE, clamped to
+ * the range 4..4096.
+ */
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+	const unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
+	unsigned long entries;
+
+	for (entries = 1; entries < wanted; entries <<= 1)
+		;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 */
+	if (entries > 4096UL)
+		entries = 4096UL;
+	if (entries < 4UL)
+		entries = 4UL;
+
+	return entries;
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ *
+ * Computed as the position of the lowest set bit of @size, by looking
+ * for the first zero bit of its complement.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	const unsigned long inverted = ~size;
+
+	return ffz(inverted);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+/*
+ * Record the node's spanned page count (sum of @zones_size) and present
+ * page count (spanned minus the sum of @zholes_size, when given) in
+ * @pgdat, and log the present count.
+ */
+static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long spanned = 0;
+	unsigned long holes = 0;
+	unsigned long present;
+	int zone;
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		spanned += zones_size[zone];
+		if (zholes_size)
+			holes += zholes_size[zone];
+	}
+	present = spanned - holes;
+
+	pgdat->node_spanned_pages = spanned;
+	pgdat->node_present_pages = present;
+	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, present);
+}
+
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * Initialises the @size struct pages starting at @start_pfn: each gets
+ * its node/zone tag, a zero count, a reset mapcount, the reserved flag
+ * and an empty lru list head.
+ */
+void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn)
+{
+	struct page *first = pfn_to_page(start_pfn);
+	unsigned long idx;
+
+	for (idx = 0; idx < size; idx++) {
+		struct page *page = first + idx;
+
+		set_page_zone(page, NODEZONE(nid, zone));
+		set_page_count(page, 0);
+		reset_page_mapcount(page);
+		SetPageReserved(page);
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page,
+					 __va((start_pfn + idx) << PAGE_SHIFT));
+#endif
+	}
+}
+
+/*
+ * Reset every order of @zone's free_area to an empty list with a zero
+ * count.  @pgdat and @size are not used here.
+ */
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long size)
+{
+	int order = 0;
+
+	while (order < MAX_ORDER) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+		zone->free_area[order].nr_free = 0;
+		order++;
+	}
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ *
+ * For each of the node's MAX_NR_ZONES zones this registers the zone in
+ * zone_table[], records its spanned size (@zones_size) and present size
+ * (spanned minus @zholes_size, when given), and initialises its locks,
+ * LRU lists and the per-cpu page lists of every CPU slot.  Zones with a
+ * non-zero spanned size additionally get a wait table allocated from
+ * bootmem, their struct pages initialised via memmap_init(), and empty
+ * free lists.  Zones are laid out back to back starting at the node's
+ * node_start_pfn.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long *zholes_size)
+{
+	unsigned long i, j;
+	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+	int cpu, nid = pgdat->node_id;
+	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+	pgdat->nr_zones = 0;
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	pgdat->kswapd_max_order = 0;
+	
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, realsize;
+		unsigned long batch;
+
+		zone_table[NODEZONE(nid, j)] = zone;
+		realsize = size = zones_size[j];
+		if (zholes_size)
+			realsize -= zholes_size[j];
+
+		if (j == ZONE_DMA || j == ZONE_NORMAL)
+			nr_kernel_pages += realsize;
+		nr_all_pages += realsize;
+
+		zone->spanned_pages = size;
+		zone->present_pages = realsize;
+		zone->name = zone_names[j];
+		spin_lock_init(&zone->lock);
+		spin_lock_init(&zone->lru_lock);
+		zone->zone_pgdat = pgdat;
+		zone->free_pages = 0;
+
+		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+		/*
+		 * The per-cpu-pages pools are set to around 1000th of the
+		 * size of the zone.  But no more than 1/4 of a meg - there's
+		 * no point in going beyond the size of L2 cache.
+		 *
+		 * OK, so we don't know how big the cache is.  So guess.
+		 */
+		batch = zone->present_pages / 1024;
+		if (batch * PAGE_SIZE > 256 * 1024)
+			batch = (256 * 1024) / PAGE_SIZE;
+		batch /= 4;		/* We effectively *= 4 below */
+		if (batch < 1)
+			batch = 1;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_list);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
+		zone->nr_active = 0;
+		zone->nr_inactive = 0;
+		if (!size)
+			continue;	/* nothing below applies to a zone that spans no pages */
+
+		/*
+		 * The per-page waitqueue mechanism uses hashed waitqueues
+		 * per zone.
+		 */
+		zone->wait_table_size = wait_table_size(size);
+		zone->wait_table_bits =
+			wait_table_bits(zone->wait_table_size);
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, zone->wait_table_size
+						* sizeof(wait_queue_head_t));
+
+		for(i = 0; i < zone->wait_table_size; ++i)
+			init_waitqueue_head(zone->wait_table + i);
+
+		pgdat->nr_zones = j+1;	/* ends up as index of the last non-empty zone, plus one */
+
+		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+		zone->zone_start_pfn = zone_start_pfn;
+
+		if ((zone_start_pfn) & (zone_required_alignment-1))
+			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
+
+		memmap_init(size, nid, j, zone_start_pfn);
+
+		zone_start_pfn += size;	/* the next zone starts where this one ends */
+
+		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	}
+}
+
+/*
+ * Make sure node @pgdat has a node_mem_map, allocating one from bootmem
+ * (spanned pages plus one entries) if none was set up beforehand.
+ * Without CONFIG_DISCONTIGMEM, node 0's map also becomes the global
+ * mem_map.
+ */
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	/* Skip empty nodes */
+	if (pgdat->node_spanned_pages == 0)
+		return;
+
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (pgdat->node_mem_map == NULL) {
+		unsigned long bytes;
+
+		bytes = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+		pgdat->node_mem_map = alloc_bootmem_node(pgdat, bytes);
+	}
+#ifndef CONFIG_DISCONTIGMEM
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's
+	 */
+	if (NODE_DATA(0) == pgdat)
+		mem_map = NODE_DATA(0)->node_mem_map;
+#endif
+}
+
+/*
+ * Initialise node @nid, described by @pgdat and starting at
+ * @node_start_pfn.  @zones_size holds the spanned size of each zone and
+ * @zholes_size, if not NULL, the pages missing from each.
+ */
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
+	/* Identify the node before anything reads these fields. */
+	pgdat->node_start_pfn = node_start_pfn;
+	pgdat->node_id = nid;
+
+	/* Page totals first: the mem_map allocation is sized from them. */
+	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+	alloc_node_mem_map(pgdat);
+
+	/* With the mem_map in place the zones themselves can be set up. */
+	free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+static bootmem_data_t contig_bootmem_data;
+struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
+
+EXPORT_SYMBOL(contig_page_data);
+
+/*
+ * Single-node setup: initialise contig_page_data as node 0, starting at
+ * the pfn of PAGE_OFFSET's physical address, with no holes.
+ */
+void __init free_area_init(unsigned long *zones_size)
+{
+	const unsigned long first_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT;
+
+	free_area_init_node(0, &contig_page_data, zones_size, first_pfn, NULL);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+/*
+ * seq_file start: return the pg_data_t at position *@pos in pgdat_list,
+ * or NULL if the list is shorter than that.
+ */
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *cur = pgdat_list;
+	loff_t to_skip = *pos;
+
+	while (cur && to_skip) {
+		cur = cur->pgdat_next;
+		--to_skip;
+	}
+
+	return cur;
+}
+
+/* seq_file next: advance *@pos and move on to the following pg_data_t. */
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *cur = arg;
+
+	*pos += 1;
+
+	return cur->pgdat_next;
+}
+
+/* seq_file stop: frag_start() acquires nothing, so nothing is released. */
+static void frag_stop(struct seq_file *m, void *arg)
+{
+	return;
+}
+
+/*
+ * This walks the free areas for each zone.
+ *
+ * seq_file show: for every populated zone of the node passed in @arg,
+ * emit one line with the number of free blocks at each order, read under
+ * zone->lock.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *node = arg;
+	unsigned long flags;
+	int idx;
+
+	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+		struct zone *z = &node->node_zones[idx];
+		int order;
+
+		if (z->present_pages == 0)
+			continue;
+
+		spin_lock_irqsave(&z->lock, flags);
+		seq_printf(m, "Node %d, zone %8s ", node->node_id, z->name);
+		for (order = 0; order < MAX_ORDER; order++)
+			seq_printf(m, "%6lu ", z->free_area[order].nr_free);
+		spin_unlock_irqrestore(&z->lock, flags);
+		seq_putc(m, '\n');
+	}
+
+	return 0;
+}
+
+struct seq_operations fragmentation_op = {	/* seq_file iterator: one record per pg_data_t on pgdat_list */
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= frag_show,
+};
+
+static char *vmstat_text[] = {	/* entry N labels the Nth unsigned long of struct page_state, see vmstat_show() */
+	"nr_dirty",
+	"nr_writeback",
+	"nr_unstable",
+	"nr_page_table_pages",
+	"nr_mapped",
+	"nr_slab",
+
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+	"pgalloc_high",
+
+	"pgalloc_normal",
+	"pgalloc_dma",
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+
+	"pgfault",
+	"pgmajfault",
+	"pgrefill_high",
+	"pgrefill_normal",
+	"pgrefill_dma",
+
+	"pgsteal_high",
+	"pgsteal_normal",
+	"pgsteal_dma",
+	"pgscan_kswapd_high",
+	"pgscan_kswapd_normal",
+
+	"pgscan_kswapd_dma",
+	"pgscan_direct_high",
+	"pgscan_direct_normal",
+	"pgscan_direct_dma",
+	"pginodesteal",
+
+	"slabs_scanned",
+	"kswapd_steal",
+	"kswapd_inodesteal",
+	"pageoutrun",
+	"allocstall",
+
+	"pgrotated",
+};
+
+/*
+ * seq_file start: take a snapshot of the full page state into a freshly
+ * allocated buffer kept in m->private, and return a pointer to the
+ * counter at index *@pos within it.  Returns NULL past the last named
+ * counter and ERR_PTR(-ENOMEM) if the buffer cannot be allocated.
+ */
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	struct page_state *snapshot;
+	unsigned long *counters;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+
+	snapshot = kmalloc(sizeof(*snapshot), GFP_KERNEL);
+	m->private = snapshot;
+	if (snapshot == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	get_full_page_state(snapshot);
+
+	/* sectors -> kbytes */
+	snapshot->pgpgin = snapshot->pgpgin / 2;
+	snapshot->pgpgout = snapshot->pgpgout / 2;
+
+	counters = (unsigned long *)snapshot;
+	return counters + *pos;
+}
+
+/*
+ * seq_file ->next: advance to the following counter of the snapshot held
+ * in m->private, or return NULL once every label has been consumed.
+ */
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	unsigned long *counters = m->private;
+
+	++*pos;
+	if (*pos < ARRAY_SIZE(vmstat_text))
+		return counters + *pos;
+	return NULL;
+}
+
+/*
+ * seq_file ->show: @arg points at one counter inside the snapshot in
+ * m->private; its distance from the start selects the label to print.
+ */
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *base = m->private;
+	unsigned long *counter = arg;
+	unsigned long idx = counter - base;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[idx], *counter);
+	return 0;
+}
+
+/* seq_file ->stop: release the snapshot allocated by vmstat_start(). */
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	void *snap = m->private;
+
+	m->private = NULL;
+	kfree(snap);
+}
+
+/*
+ * seq_file iterator over the page state counters: ->start snapshots them
+ * into m->private, ->show prints one "name value" line per counter and
+ * ->stop frees the snapshot.
+ */
+struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * CPU hotplug callback.  When @hcpu has gone away (CPU_DEAD), fold its
+ * private pagecache count into the global counter, drain its per-cpu pages
+ * and add its page_states counters to those of the current cpu.  Every
+ * other action is ignored.  Always returns NOTIFY_OK.
+ */
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	unsigned long *dead, *ours;
+	long *pagecache;
+	int i;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+
+	/* Drain local pagecache count. */
+	pagecache = &per_cpu(nr_pagecache_local, cpu);
+	atomic_add(*pagecache, &nr_pagecache);
+	*pagecache = 0;
+
+	local_irq_disable();
+	__drain_pages(cpu);
+
+	/* Add dead cpu's page_states to our own. */
+	ours = (unsigned long *)&__get_cpu_var(page_states);
+	dead = (unsigned long *)&per_cpu(page_states, cpu);
+	for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); i++) {
+		ours[i] += dead[i];
+		dead[i] = 0;
+	}
+
+	local_irq_enable();
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * Boot-time hook: register page_alloc_cpu_notify() as a CPU hotplug
+ * notifier with priority 0.
+ * NOTE(review): page_alloc_cpu_notify() is only defined under
+ * CONFIG_HOTPLUG_CPU; presumably hotcpu_notifier() compiles away without
+ * it - confirm against its definition.
+ */
+void __init page_alloc_init(void)
+{
+	hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ * For every zone j of every node, each lower zone idx gets
+ *	lowmem_reserve[j] = (present pages of zones idx+1 .. j) /
+ *			    sysctl_lowmem_reserve_ratio[idx]
+ * and zone j's own lowmem_reserve[j] is set to 0.  Ratio entries below 1
+ * are raised to 1 in the sysctl array itself, which also rules out a
+ * division by zero.
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+	struct pglist_data *pgdat;
+	int j, idx;
+
+	for_each_pgdat(pgdat) {
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *zone = pgdat->node_zones + j;
+			/* running total: pages of zone j plus the zones walked so far */
+			unsigned long present_pages = zone->present_pages;
+
+			zone->lowmem_reserve[j] = 0;
+
+			/* walk downwards from the zone just below j */
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone *lower_zone;
+
+				/* clamp the tunable in place */
+				if (sysctl_lowmem_reserve_ratio[idx] < 1)
+					sysctl_lowmem_reserve_ratio[idx] = 1;
+
+				lower_zone = pgdat->node_zones + idx;
+				lower_zone->lowmem_reserve[j] = present_pages /
+					sysctl_lowmem_reserve_ratio[idx];
+				/* only now count this zone, for the next lower one */
+				present_pages += lower_zone->present_pages;
+			}
+		}
+	}
+}
+
+/*
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
+ *	that the pages_{min,low,high} values for each zone are set correctly
+ *	with respect to min_free_kbytes.
+ *
+ * Lowmem zones share min_free_kbytes (converted to pages) in proportion to
+ * their present_pages.  Highmem zones get present_pages / 1024, clamped to
+ * the range [SWAP_CLUSTER_MAX, 128].  pages_low and pages_high are then
+ * derived as 5/4 and 6/4 of pages_min.  Each zone is updated under its
+ * lru_lock.
+ */
+static void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+	}
+
+	for_each_zone(zone) {
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (is_highmem(zone)) {
+			/*
+			 * Often, highmem doesn't need to reserve any pages.
+			 * But the pages_min/low/high values are also used for
+			 * batching up page reclaim activity so we need a
+			 * decent value here.
+			 */
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else {
+			/*
+			 * if it's a lowmem zone, reserve a number of pages
+			 * proportionate to the zone's size.
+			 *
+			 * The product is formed in 64 bits: where unsigned
+			 * long is 32 bits wide, pages_min * present_pages
+			 * wraps once min_free_kbytes is raised on a machine
+			 * with a large lowmem zone, which would silently
+			 * shrink the watermark.
+			 * NOTE(review): do_div() is given lowmem_pages as
+			 * divisor; confirm its divisor width is sufficient
+			 * for the largest supported lowmem size.
+			 */
+			unsigned long long share;
+
+			share = (unsigned long long)pages_min *
+					zone->present_pages;
+			do_div(share, lowmem_pages);
+			zone->pages_min = share;
+		}
+
+		/*
+		 * When interpreting these watermarks, just keep in mind that:
+		 * zone->pages_min == (zone->pages_min * 4) / 4;
+		 */
+		zone->pages_low   = (zone->pages_min * 5) / 4;
+		zone->pages_high  = (zone->pages_min * 6) / 4;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+
+/*
+ * Pick the boot-time default for min_free_kbytes and derive the per-zone
+ * watermarks and lowmem reserves from it.
+ *
+ * Small machines should keep little memory free (floor: 128k), large
+ * machines more (ceiling: 64MB).  The relation is deliberately not linear,
+ * because network bandwidth does not increase linearly with machine size:
+ *
+ * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
+ *
+ * Resulting values:
+ *
+ * 16MB:	512k
+ * 32MB:	724k
+ * 64MB:	1024k
+ * 128MB:	1448k
+ * 256MB:	2048k
+ * 512MB:	2896k
+ * 1024MB:	4096k
+ * 2048MB:	5792k
+ * 4096MB:	8192k
+ * 8192MB:	11584k
+ * 16384MB:	16384k
+ */
+static int __init init_per_zone_pages_min(void)
+{
+	unsigned long lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+	/* clamp to [128k, 64MB] */
+	if (min_free_kbytes > 65536)
+		min_free_kbytes = 65536;
+	else if (min_free_kbytes < 128)
+		min_free_kbytes = 128;
+
+	setup_per_zone_pages_min();
+	setup_per_zone_lowmem_reserve();
+	return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - wrapper around proc_dointvec() that
+ *	recomputes the per-zone pages_{min,low,high} watermarks whenever
+ *	min_free_kbytes is written.
+ *
+ * Reads leave the watermarks alone: nothing has changed, so there is no
+ * reason to take every zone's lru_lock.  On a write the watermarks are
+ * recomputed even if proc_dointvec() reports an error, so that they always
+ * match whatever value ended up being stored.
+ *
+ * Returns the result of proc_dointvec() so that a failed access is
+ * reported to the caller instead of being turned into success.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec(table, write, file, buffer, length, ppos);
+	if (write)
+		setup_per_zone_pages_min();
+	return rc;
+}
+
+/*
+ * lowmem_reserve_ratio_sysctl_handler - wrapper around
+ *	proc_dointvec_minmax() that calls setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio is written.
+ *
+ * The reserve ratio has no relation to the pages_min watermarks.  The
+ * lowmem reserve ratio only makes sense as a function of the boot time
+ * zone sizes.
+ *
+ * Reads do not recompute anything.  On a write the reserves are recomputed
+ * even if proc_dointvec_minmax() reports an error, because part of the
+ * array may already have been updated by then.
+ *
+ * Returns the result of proc_dointvec_minmax() so that a failed access is
+ * reported to the caller instead of being turned into success.
+ */
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (write)
+		setup_per_zone_lowmem_reserve();
+	return rc;
+}
+
+/* Non-zero makes alloc_large_system_hash() use __vmalloc() for late tables. */
+__initdata int hashdist = HASHDIST_DEFAULT;
+
+#ifdef CONFIG_NUMA
+/*
+ * Handle the "hashdist=" boot parameter: store the parsed number in
+ * hashdist.  Returns 1 when a value was given, 0 when @str is NULL.
+ */
+static int __init set_hashdist(char *str)
+{
+	if (str) {
+		hashdist = simple_strtoul(str, &str, 0);
+		return 1;
+	}
+	return 0;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ *
+ * @tablename:   name used in the panic and printk messages
+ * @bucketsize:  size of one bucket in bytes
+ * @numentries:  requested number of buckets; 0 means derive it from the
+ *               amount of memory
+ * @scale:       when deriving, one bucket per 2^scale bytes of memory
+ * @flags:       HASH_HIGHMEM sizes from nr_all_pages instead of
+ *               nr_kernel_pages; HASH_EARLY allocates with alloc_bootmem()
+ * @_hash_shift: if non-NULL, receives log2 of the final bucket count
+ * @_hash_mask:  if non-NULL, receives the final bucket count minus one
+ * @limit:       maximum number of buckets; 0 selects the default cap of
+ *               1/16 of total memory
+ *
+ * Returns the table.  Panics if no allocation succeeds.
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long limit)
+{
+	unsigned long long max = limit;
+	unsigned long log2qty, size;
+	void *table = NULL;
+
+	/* allow the kernel cmdline to have a say */
+	if (!numentries) {
+		/* round applicable memory size up to nearest megabyte */
+		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+		numentries >>= 20 - PAGE_SHIFT;
+		numentries <<= 20 - PAGE_SHIFT;
+
+		/* limit to 1 bucket per 2^scale bytes of low memory */
+		if (scale > PAGE_SHIFT)
+			numentries >>= (scale - PAGE_SHIFT);
+		else
+			numentries <<= (PAGE_SHIFT - scale);
+	}
+	/*
+	 * rounded up to nearest power of 2 in size
+	 * NOTE(review): if long_log2() is floor(log2(x)), a value that is
+	 * already a power of two is doubled here rather than kept - confirm
+	 * that this is intended.
+	 */
+	numentries = 1UL << (long_log2(numentries) + 1);
+
+	/* limit allocation size to 1/16 total memory by default */
+	if (max == 0) {
+		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+		/* convert the byte budget into a number of buckets */
+		do_div(max, bucketsize);
+	}
+
+	if (numentries > max)
+		numentries = max;
+
+	/* max need not be a power of two; only log2qty is used from here on */
+	log2qty = long_log2(numentries);
+
+	/*
+	 * Try to allocate; on failure retry with log2qty reduced by one, for
+	 * as long as the failed size was above one page and log2qty stays
+	 * non-zero.
+	 */
+	do {
+		size = bucketsize << log2qty;
+		if (flags & HASH_EARLY)
+			table = alloc_bootmem(size);
+		else if (hashdist)
+			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+		else {
+			unsigned long order;
+			/* smallest page order whose span covers size bytes */
+			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
+				;
+			table = (void*) __get_free_pages(GFP_ATOMIC, order);
+		}
+	} while (!table && size > PAGE_SIZE && --log2qty);
+
+	if (!table)
+		panic("Failed to allocate %s hash table\n", tablename);
+
+	/* NOTE(review): the entry count is unsigned but printed with %d. */
+	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+	       tablename,
+	       (1U << log2qty),
+	       long_log2(size) - PAGE_SHIFT,
+	       size);
+
+	if (_hash_shift)
+		*_hash_shift = log2qty;
+	if (_hash_mask)
+		*_hash_mask = (1 << log2qty) - 1;
+
+	return table;
+}
diff --git a/mm/page_io.c b/mm/page_io.c
new file mode 100644
index 000000000000..667c76df1ec2
--- /dev/null
+++ b/mm/page_io.c
@@ -0,0 +1,160 @@
+/*
+ *  linux/mm/page_io.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Swap reorganised 29.12.95, 
+ *  Asynchronous swapping added 30.12.95. Stephen Tweedie
+ *  Removed race in async swapping. 14.4.1996. Bruno Haible
+ *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
+ *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <asm/pgtable.h>
+
+/*
+ * Allocate a one-segment bio covering @page for the swap slot encoded in
+ * @index, with @end_io as completion handler.  Returns NULL if the bio
+ * cannot be allocated.
+ */
+static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
+				struct page *page, bio_end_io_t end_io)
+{
+	swp_entry_t entry = { .val = index, };
+	struct swap_info_struct *sis;
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, 1);
+	if (!bio)
+		return NULL;
+
+	/* Where on which device does this swap entry live? */
+	sis = get_swap_info_struct(swp_type(entry));
+	bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
+				(PAGE_SIZE >> 9);
+	bio->bi_bdev = sis->bdev;
+
+	/* A single vector spanning the whole page. */
+	bio->bi_io_vec[0].bv_page = page;
+	bio->bi_io_vec[0].bv_len = PAGE_SIZE;
+	bio->bi_io_vec[0].bv_offset = 0;
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = PAGE_SIZE;
+
+	bio->bi_end_io = end_io;
+	return bio;
+}
+
+/*
+ * Completion handler for swap-out.  Returns 1 while part of the bio is
+ * still outstanding; otherwise flags the page on error, ends its
+ * writeback, drops the bio and returns 0.
+ */
+static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *pg = bio->bi_io_vec[0].bv_page;
+
+	if (bio->bi_size != 0)
+		return 1;
+
+	if (ok == 0)
+		SetPageError(pg);
+
+	end_page_writeback(pg);
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * Completion handler for swap-in.  Returns 1 while part of the bio is
+ * still outstanding; otherwise records success or failure in the page
+ * flags, unlocks the page, drops the bio and returns 0.
+ */
+static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *pg = bio->bi_io_vec[0].bv_page;
+
+	if (bio->bi_size != 0)
+		return 1;
+
+	if (ok) {
+		SetPageUptodate(pg);
+	} else {
+		SetPageError(pg);
+		ClearPageUptodate(pg);
+	}
+
+	unlock_page(pg);
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * Write @page out to its swap slot.
+ *
+ * Stale swap cache pages may still be in memory: if
+ * remove_exclusive_swap_page() gets rid of this one, the final write is
+ * unnecessary and is skipped.
+ *
+ * Returns 0 on success (or when no write was needed) and -ENOMEM if no bio
+ * could be allocated, in which case the page is redirtied.  The page is
+ * unlocked on every path.
+ */
+int swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct bio *bio;
+	int rw = WRITE;
+
+	if (remove_exclusive_swap_page(page)) {
+		unlock_page(page);
+		return 0;
+	}
+
+	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+	if (!bio) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		rw |= (1 << BIO_RW_SYNC);
+
+	inc_page_state(pswpout);
+	set_page_writeback(page);
+	unlock_page(page);
+	submit_bio(rw, bio);
+	return 0;
+}
+
+/*
+ * Start reading @page from its swap slot.  The page must be locked by the
+ * caller; end_swap_bio_read() unlocks it when the I/O finishes.  Returns 0
+ * once the read is submitted, or -ENOMEM (page unlocked) if no bio could
+ * be allocated.
+ */
+int swap_readpage(struct file *file, struct page *page)
+{
+	struct bio *bio;
+
+	BUG_ON(!PageLocked(page));
+	ClearPageUptodate(page);
+
+	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+	if (!bio) {
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	inc_page_state(pswpin);
+	submit_bio(READ, bio);
+	return 0;
+}
+
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
+/*
+ * Read or write one arbitrary swap page and wait for the I/O to finish.
+ * The caller must hold a reference on the page.
+ *
+ * end_swap_bio_read() serves as completion handler for writes as well,
+ * because it happens to do what is needed here.
+ *
+ * Returns 0 on success, -ENOMEM if no bio could be allocated and -EIO if
+ * the page ends up not uptodate or flagged with an error.
+ */
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
+{
+	struct bio *bio;
+
+	lock_page(page);
+
+	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
+	if (!bio) {
+		unlock_page(page);
+		return -ENOMEM;
+	}
+
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	/* the completion handler unlocks the page */
+	wait_on_page_locked(page);
+
+	if (PageUptodate(page) && !PageError(page))
+		return 0;
+	return -EIO;
+}
+#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
new file mode 100644
index 000000000000..38ce279cc8cd
--- /dev/null
+++ b/mm/pdflush.c
@@ -0,0 +1,228 @@
+/*
+ * mm/pdflush.c - worker threads for writing back filesystem data
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * 09Apr2002	akpm@zip.com.au
+ *		Initial version
+ * 29Feb2004	kaos@sgi.com
+ *		Move worker thread creation to kthread to avoid chewing
+ *		up stack space with nested calls to kernel_thread.
+ */
+
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>		// Needed by writeback.h
+#include <linux/writeback.h>	// Prototypes pdflush_operation()
+#include <linux/kthread.h>
+
+
+/*
+ * Minimum and maximum number of pdflush instances.
+ *
+ * pdflush_init() starts MIN_PDFLUSH_THREADS threads and a thread never
+ * exits while nr_pdflush_threads is at or below that value.  A further
+ * thread is only started while nr_pdflush_threads is below
+ * MAX_PDFLUSH_THREADS.
+ */
+#define MIN_PDFLUSH_THREADS	2
+#define MAX_PDFLUSH_THREADS	8
+
+static void start_one_pdflush_thread(void);
+
+
+/*
+ * The pdflush threads are worker threads for writing back dirty data.
+ * Ideally, we'd like one thread per active disk spindle.  But the disk
+ * topology is very hard to divine at this level.   Instead, we take
+ * care in various places to prevent more than one pdflush thread from
+ * performing writeback against a single filesystem.  pdflush threads
+ * have the PF_FLUSHER flag set in current->flags to aid in this.
+ */
+
+/*
+ * All the idle pdflush threads.  Protected by pdflush_lock.  A thread puts
+ * itself on this list before sleeping and pdflush_operation() takes it off
+ * again when handing it work.
+ */
+static LIST_HEAD(pdflush_list);
+static DEFINE_SPINLOCK(pdflush_lock);
+
+/*
+ * The count of currently-running pdflush threads.  Protected
+ * by pdflush_lock.
+ *
+ * Readable by sysctl, but not writable.  Published to userspace at
+ * /proc/sys/vm/nr_pdflush_threads.
+ */
+int nr_pdflush_threads = 0;
+
+/*
+ * The time at which the pdflush thread pool last went empty.  Written by
+ * pdflush_operation() when it hands out the last idle thread, and read
+ * without the lock by __pdflush() when deciding whether to start another
+ * thread.
+ */
+static unsigned long last_empty_jifs;
+
+/*
+ * The pdflush thread.
+ *
+ * Thread pool management algorithm:
+ * 
+ * - The minimum and maximum number of pdflush instances are bound
+ *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
+ * 
+ * - If there have been no idle pdflush instances for 1 second, create
+ *   a new one.
+ * 
+ * - If the least-recently-went-to-sleep pdflush thread has been asleep
+ *   for more than one second, terminate a thread.
+ */
+
+/*
+ * A structure for passing work to a pdflush thread.  Also for passing
+ * state information between pdflush threads.  Protected by pdflush_lock.
+ *
+ * One instance exists per thread, on that thread's stack (see pdflush()).
+ * pdflush_operation() fills in fn and arg0 before waking the thread;
+ * __pdflush() resets fn to NULL once the callback has run.
+ */
+struct pdflush_work {
+	struct task_struct *who;	/* The thread */
+	void (*fn)(unsigned long);	/* A callback function */
+	unsigned long arg0;		/* An argument to the callback */
+	struct list_head list;		/* On pdflush_list, when idle */
+	unsigned long when_i_went_to_sleep;	/* jiffies when it last went idle */
+};
+
+/*
+ * Main loop of a pdflush worker thread.  @my_work is this thread's work
+ * descriptor and doubles as its entry on pdflush_list while it is idle.
+ *
+ * Each iteration parks the thread on pdflush_list, sleeps, and after being
+ * woken runs the callback that pdflush_operation() stored in @my_work.
+ * After the callback the thread may start an additional worker, and may
+ * leave the loop so that this thread terminates.
+ *
+ * Returns 0 when the thread leaves the loop.
+ */
+static int __pdflush(struct pdflush_work *my_work)
+{
+	current->flags |= PF_FLUSHER;
+	my_work->fn = NULL;
+	my_work->who = current;
+	INIT_LIST_HEAD(&my_work->list);
+
+	spin_lock_irq(&pdflush_lock);
+	nr_pdflush_threads++;
+	/* Every pass through the loop starts with pdflush_lock held. */
+	for ( ; ; ) {
+		struct pdflush_work *pdf;
+
+		/* Mark ourselves sleeping before becoming visible as idle. */
+		set_current_state(TASK_INTERRUPTIBLE);
+		list_move(&my_work->list, &pdflush_list);
+		my_work->when_i_went_to_sleep = jiffies;
+		spin_unlock_irq(&pdflush_lock);
+
+		schedule();
+		if (try_to_freeze(PF_FREEZE)) {
+			/* Retake the lock: the loop head expects it held. */
+			spin_lock_irq(&pdflush_lock);
+			continue;
+		}
+
+		spin_lock_irq(&pdflush_lock);
+		/*
+		 * pdflush_operation() unlinks us before waking us, so still
+		 * being on the list means nobody handed us any work.
+		 */
+		if (!list_empty(&my_work->list)) {
+			printk("pdflush: bogus wakeup!\n");
+			my_work->fn = NULL;
+			continue;
+		}
+		if (my_work->fn == NULL) {
+			printk("pdflush: NULL work function\n");
+			continue;
+		}
+		spin_unlock_irq(&pdflush_lock);
+
+		/* Run the callback without holding the lock. */
+		(*my_work->fn)(my_work->arg0);
+
+		/*
+		 * Thread creation: For how long have there been zero
+		 * available threads?
+		 */
+		if (jiffies - last_empty_jifs > 1 * HZ) {
+			/* unlocked list_empty() test is OK here */
+			if (list_empty(&pdflush_list)) {
+				/* unlocked test is OK here */
+				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
+					start_one_pdflush_thread();
+			}
+		}
+
+		spin_lock_irq(&pdflush_lock);
+		my_work->fn = NULL;
+
+		/*
+		 * Thread destruction: For how long has the sleepiest
+		 * thread slept?
+		 */
+		if (list_empty(&pdflush_list))
+			continue;
+		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
+			continue;
+		/* The entry at the tail of the idle list is the one examined. */
+		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
+		if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
+			/* Limit exit rate */
+			pdf->when_i_went_to_sleep = jiffies;
+			break;					/* exeunt */
+		}
+	}
+	/* Reached only via the break above, with pdflush_lock held. */
+	nr_pdflush_threads--;
+	spin_unlock_irq(&pdflush_lock);
+	return 0;
+}
+
+/*
+ * Thread entry point.  The work descriptor would naturally be a local of
+ * __pdflush(); it is kept in this separate frame in the hope of stopping
+ * the compiler from performing unfortunate optimisations against the auto
+ * variables, since other tasks and CPUs can see it.  (No problem has
+ * actually been observed.  This is just paranoia).
+ */
+static int pdflush(void *dummy)
+{
+	struct pdflush_work work;
+
+	/*
+	 * dm-crypt can make pdflush spend a lot of time on encryption, and
+	 * that should not happen at keventd's priority.
+	 */
+	set_user_nice(current, 0);
+
+	return __pdflush(&work);
+}
+
+/*
+ * Hand @fn, to be called with @arg0, to an idle pdflush thread and wake
+ * that thread up.  Returns zero if a worker thread was found and given the
+ * payload, -1 if no thread was idle.
+ */
+int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+{
+	struct pdflush_work *idle;
+	unsigned long flags;
+	int ret = -1;
+
+	if (!fn)
+		BUG();		/* Hard to diagnose if it's deferred */
+
+	spin_lock_irqsave(&pdflush_lock, flags);
+	if (!list_empty(&pdflush_list)) {
+		idle = list_entry(pdflush_list.next, struct pdflush_work, list);
+		list_del_init(&idle->list);
+
+		/* That was the last idle thread: note when the pool emptied. */
+		if (list_empty(&pdflush_list))
+			last_empty_jifs = jiffies;
+
+		idle->fn = fn;
+		idle->arg0 = arg0;
+		wake_up_process(idle->who);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&pdflush_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Create and start one more "pdflush" kernel thread running pdflush().
+ * NOTE(review): the result of kthread_run() is ignored, so a failure to
+ * create the thread goes unnoticed here.
+ */
+static void start_one_pdflush_thread(void)
+{
+	kthread_run(pdflush, NULL, "pdflush");
+}
+
+/* Start the initial MIN_PDFLUSH_THREADS worker threads.  Always returns 0. */
+static int __init pdflush_init(void)
+{
+	int remaining = MIN_PDFLUSH_THREADS;
+
+	while (remaining-- > 0)
+		start_one_pdflush_thread();
+
+	return 0;
+}
+
+module_init(pdflush_init);
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
new file mode 100644
index 000000000000..b4e76c25f953
--- /dev/null
+++ b/mm/prio_tree.c
@@ -0,0 +1,207 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
+ *
+ * This file is released under the GPL v2.
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004	Initial version
+ */
+
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * See lib/prio_tree.c for details on the general radix priority search tree
+ * code.
+ */
+
+/*
+ * The following #defines are mirrored from lib/prio_tree.c. They're only used
+ * for debugging, and should be removed (along with the debugging code using
+ * them) when switching also VMAs to the regular prio_tree code.
+ */
+
+/* file page offset at which the vma starts */
+#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
+/* length of the vma in pages */
+#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/*
+ * file page offset of the last page of the vma
+ * avoid overflow (presumably why 1 is subtracted from the size before the
+ * addition rather than from the sum - TODO confirm)
+ */
+#define HEAP_INDEX(vma)   ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+/*
+ * Radix priority search tree for address_space->i_mmap
+ *
+ * For each vma that maps a unique set of file pages, i.e. a unique
+ * [radix_index, heap_index] value, we have a corresponding priority search
+ * tree node. If multiple vmas have identical [radix_index, heap_index] value,
+ * then one of them is used as a tree node and others are stored in a vm_set
+ * list. The tree node points to the first vma (head) of the list using
+ * vm_set.head.
+ *
+ * prio_tree_root
+ *      |
+ *      A       vm_set.head
+ *     / \      /
+ *    L   R -> H-I-J-K-M-N-O-P-Q-S
+ *    ^   ^    <-- vm_set.list -->
+ *  tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.vm_set.parent is used for identifying
+ * tree nodes and list head nodes.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.vm_set.parent != NULL    ==> a tree node
+ *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
+ *      vma->shared.vm_set.head == NULL ==> no others map the same range
+ *
+ * vma->shared.vm_set.parent == NULL
+ * 	vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
+ * 	vma->shared.vm_set.head == NULL ==> a list node
+ */
+
+/*
+ * Attach @vma, which is known to map the same set of pages as @old, to the
+ * set of vmas @old belongs to.  Used by fork's dup_mmap and by
+ * vma_prio_tree_insert() below.  It just happens to work correctly on
+ * i_mmap_nonlinear too.
+ */
+void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
+{
+	struct vm_area_struct *head;
+
+	/* Leave these BUG_ONs till prio_tree patch stabilizes */
+	BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+	BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+	vma->shared.vm_set.head = NULL;
+	vma->shared.vm_set.parent = NULL;
+
+	if (!old->shared.vm_set.parent) {
+		/* old is not a tree node: link vma right after it */
+		list_add(&vma->shared.vm_set.list,
+				&old->shared.vm_set.list);
+		return;
+	}
+
+	head = old->shared.vm_set.head;
+	if (head) {
+		/* tree node that already has a list: append at its end */
+		list_add_tail(&vma->shared.vm_set.list,
+				&head->shared.vm_set.list);
+		return;
+	}
+
+	/* tree node without a list yet: vma becomes the list head */
+	INIT_LIST_HEAD(&vma->shared.vm_set.list);
+	vma->shared.vm_set.head = old;
+	old->shared.vm_set.head = vma;
+}
+
+/*
+ * Insert @vma into the tree at @root.  If raw_prio_tree_insert() hands
+ * back a node other than @vma's own, @vma is attached to the vma owning
+ * that node via vma_prio_tree_add() instead.
+ */
+void vma_prio_tree_insert(struct vm_area_struct *vma,
+			  struct prio_tree_root *root)
+{
+	struct prio_tree_node *found;
+
+	vma->shared.vm_set.head = NULL;
+
+	found = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
+	if (found == (struct prio_tree_node *) &vma->shared.prio_tree_node)
+		return;
+
+	vma_prio_tree_add(vma, prio_tree_entry(found, struct vm_area_struct,
+						shared.prio_tree_node));
+}
+
+/*
+ * Remove @vma from the tree at @root, or from the vm_set list it is on.
+ *
+ * Four cases, told apart by vm_set.head and vm_set.parent:
+ *  - no head, no parent: plain list member, just unlink it
+ *  - no head, parent:    tree node nobody else shares, remove it from the tree
+ *  - head and parent:    tree node with a list; the list head takes over
+ *                        the tree position
+ *  - head, no parent:    list head; the next list member (if any) becomes
+ *                        the new head
+ */
+void vma_prio_tree_remove(struct vm_area_struct *vma,
+			  struct prio_tree_root *root)
+{
+	struct vm_area_struct *node, *head, *new_head;
+
+	if (!vma->shared.vm_set.head) {
+		if (!vma->shared.vm_set.parent)
+			list_del_init(&vma->shared.vm_set.list);
+		else
+			raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
+	} else {
+		/* Leave this BUG_ON till prio_tree patch stabilizes */
+		/* tree node and list head must point at each other */
+		BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
+		if (vma->shared.vm_set.parent) {
+			head = vma->shared.vm_set.head;
+			/* pick the successor head before unlinking the old one */
+			if (!list_empty(&head->shared.vm_set.list)) {
+				new_head = list_entry(
+					head->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&head->shared.vm_set.list);
+			} else
+				new_head = NULL;
+
+			/* the old list head takes vma's place in the tree */
+			raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
+					&head->shared.prio_tree_node);
+			head->shared.vm_set.head = new_head;
+			if (new_head)
+				new_head->shared.vm_set.head = head;
+
+		} else {
+			/* vma is the list head; node is the tree node it hangs off */
+			node = vma->shared.vm_set.head;
+			if (!list_empty(&vma->shared.vm_set.list)) {
+				new_head = list_entry(
+					vma->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&vma->shared.vm_set.list);
+				node->shared.vm_set.head = new_head;
+				new_head->shared.vm_set.head = node;
+			} else
+				node->shared.vm_set.head = NULL;
+		}
+	}
+}
+
+/*
+ * Iterator step for enumerating the vmas that map a given file page or a
+ * range of contiguous file pages; every vma returned maps at least one
+ * page of that range.
+ *
+ * The first call passes a NULL @vma, later calls pass the vma returned
+ * previously.  Returns the next vma, or NULL when the walk is finished.
+ */
+struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+					struct prio_tree_iter *iter)
+{
+	struct prio_tree_node *ptr;
+	struct vm_area_struct *next;
+
+	if (vma) {
+		if (vma->shared.vm_set.parent) {
+			/* A tree node: continue with its list head, if any. */
+			next = vma->shared.vm_set.head;
+			if (next) {
+				prefetch(next->shared.vm_set.list.next);
+				return next;
+			}
+		} else {
+			/*
+			 * On a vm_set list: continue with the next member
+			 * unless that member has vm_set.head set.
+			 */
+			next = list_entry(vma->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+			if (!next->shared.vm_set.head) {
+				prefetch(next->shared.vm_set.list.next);
+				return next;
+			}
+		}
+	}
+
+	/* First call, or the current set is exhausted: next tree node. */
+	ptr = prio_tree_next(iter);
+	if (!ptr)
+		return NULL;
+
+	next = prio_tree_entry(ptr, struct vm_area_struct,
+				shared.prio_tree_node);
+	prefetch(next->shared.vm_set.head);
+	return next;
+}
diff --git a/mm/readahead.c b/mm/readahead.c
new file mode 100644
index 000000000000..b840e7c6ea74
--- /dev/null
+++ b/mm/readahead.c
@@ -0,0 +1,557 @@
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002	akpm@zip.com.au
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+
+void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{	/* Deliberately empty: this unplug callback does nothing. */
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
+struct backing_dev_info default_backing_dev_info = {
+	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, /* max readahead, in pages */
+	.state		= 0,
+	.capabilities	= BDI_CAP_MAP_COPY,
+	.unplug_io_fn	= default_unplug_io_fn,	/* the empty hook defined in this file */
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+
+/*
+ * Initialise a struct file's readahead state.  Assumes that the caller has
+ * memset *ra to zero.
+ */
+void file_ra_state_init(struct file_ra_state *ra,
+			struct address_space *mapping)
+{
+	ra->prev_page = -1;	/* -1 marks "no read seen yet" */
+	ra->ra_pages = mapping->backing_dev_info->ra_pages;
+}
+
+/*
+ * Return max readahead size for this inode in number-of-pages.
+ */
+static inline unsigned long get_max_readahead(struct file_ra_state *ra)
+{
+	return ra->ra_pages;	/* copied from the bdi by file_ra_state_init() */
+}
+
+static inline unsigned long get_min_readahead(struct file_ra_state *ra)
+{	/* Fixed floor in pages; @ra is accepted but never consulted. */
+	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+}
+
+static inline void ra_off(struct file_ra_state *ra)
+{
+	/* Drop both windows and all flags; prev_page is left untouched. */
+	ra->ahead_size = 0;
+	ra->ahead_start = 0;
+	ra->size = 0;
+	ra->start = 0;
+	ra->flags = 0;
+}
+
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long pow2 = roundup_pow_of_two(size);
+
+	/* Small request (at most max/64): square the rounded-up size. */
+	if (pow2 <= max / 64)
+		return pow2 * pow2;
+	/* Medium request (at most max/4): start with a quarter of max. */
+	if (pow2 <= max / 4)
+		return max / 4;
+	return max;	/* Large request: go straight to the maximum. */
+}
+
+/*
+ * Set the new window size, this is called only when I/O is to be submitted,
+ * not for each call to readahead.  If a cache miss occurred, shrink the next
+ * I/O by two pages (not below min), else grow it 4x or 2x; cap at max.
+ */
+static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
+{
+	unsigned long max = get_max_readahead(ra);
+	unsigned long min = get_min_readahead(ra);
+	unsigned long cur = ra->size;
+	unsigned long newsize;
+
+	if (ra->flags & RA_FLAG_MISS) {
+		ra->flags &= ~RA_FLAG_MISS;	/* the miss is consumed here */
+		newsize = (cur > min + 2) ? cur - 2 : min; /* no unsigned wrap */
+	} else if (cur < max / 16) {
+		newsize = 4 * cur;	/* far below max: quadruple */
+	} else {
+		newsize = 2 * cur;	/* closer to max: double */
+	}
+	return min(newsize, max);
+}
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /* page linked at head->prev */
+
+/**
+ * read_cache_pages - populate an address space with some pages, and
+ * 			start reads against them.
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Hides the details of the LRU cache etc from the filesystems.
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+			int (*filler)(void *, struct page *), void *data)
+{
+	struct pagevec lru_pvec;
+	int ret = 0;
+
+	pagevec_init(&lru_pvec, 0);
+
+	/* Feed pages to @filler until the list drains or @filler fails. */
+	while (!ret && !list_empty(pages)) {
+		struct page *page = list_to_page(pages);
+
+		list_del(&page->lru);
+		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+			/* Could not insert it: release the page and move on. */
+			page_cache_release(page);
+			continue;
+		}
+		ret = filler(data, page);
+		/* The page goes onto the LRU whether or not @filler failed. */
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+	}
+	/* Pages are only left here when @filler failed: release them. */
+	while (!list_empty(pages)) {
+		struct page *victim = list_to_page(pages);
+
+		list_del(&victim->lru);
+		page_cache_release(victim);
+	}
+	pagevec_lru_add(&lru_pvec);
+	return ret;
+}
+
+EXPORT_SYMBOL(read_cache_pages);
+
+static int read_pages(struct address_space *mapping, struct file *filp,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct pagevec lru_pvec;
+	unsigned remaining;
+
+	/* Prefer the address_space's batched ->readpages when it has one. */
+	if (mapping->a_ops->readpages)
+		return mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+
+	/* Otherwise insert and read the pages one at a time. */
+	pagevec_init(&lru_pvec, 0);
+	for (remaining = nr_pages; remaining; remaining--) {
+		struct page *page = list_to_page(pages);
+
+		list_del(&page->lru);
+		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+			/* Insertion failed: release the page unread. */
+			page_cache_release(page);
+			continue;
+		}
+		/* The ->readpage return value is not checked here. */
+		mapping->a_ops->readpage(filp, page);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+	}
+	pagevec_lru_add(&lru_pvec);
+	return 0;
+}
+
+/*
+ * Readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * start:	Page index at which we started the readahead
+ * size:	Number of pages in that read
+ *              Together, start and size form the "current window",
+ *              also referred to as the `readahead window'.
+ * prev_page:   The page which the readahead algorithm most-recently inspected.
+ *              It is mainly used to detect sequential file reading.
+ *              If page_cache_readahead sees that it is again being called for
+ *              a page which it just looked at, it can return immediately without
+ *              making any state changes.
+ * ahead_start,
+ * ahead_size:  Together, these form the "ahead window".
+ * ra_pages:	The externally controlled max readahead for this fd.
+ *
+ * When readahead is in the off state (size == 0), readahead is disabled.
+ * In this state, prev_page is used to detect the resumption of sequential I/O.
+ *
+ * The readahead code manages two windows - the "current" and the "ahead"
+ * windows.  The intent is that while the application is walking the pages
+ * in the current window, I/O is underway on the ahead window.  When the
+ * current window is fully traversed, it is replaced by the ahead window
+ * and the ahead window is invalidated.  When this copying happens, the
+ * new current window's pages are probably still locked.  So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
+ *
+ * So:
+ *
+ *   ----|----------------|----------------|-----
+ *       ^start           ^start+size
+ *                        ^ahead_start     ^ahead_start+ahead_size
+ *
+ *         ^ When this page is read, we submit I/O for the
+ *           ahead window.
+ *
+ * A `readahead hit' occurs when a read request is made against a page which is
+ * the next sequential page. Ahead window calculations are done only when it
+ * is time to submit a new IO.  The code ramps up the size aggressively at
+ * first, but slows down as it approaches max_readahead.
+ *
+ * Any seek/random IO will result in readahead being turned off.  It will resume
+ * at the first sequential access.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead.  It is called only once for the entire I/O
+ * regardless of size unless readahead is unable to start enough I/O to satisfy
+ * the request (I/O request > max_readahead).
+ */
+
+/*
+ * __do_page_cache_readahead actually reads a chunk of disk.  It allocates all
+ * the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * Returns the number of pages it allocated and passed to read_pages(); pages
+ * already found in the page cache are skipped and not counted.
+ * The do_page_cache_readahead() wrapper returns -1 if it encountered request
+ * queue congestion.
+ */
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long end_index;	/* The last page we want to read */
+	LIST_HEAD(page_pool);
+	int page_idx;
+	int ret = 0;	/* pages queued on page_pool so far */
+	loff_t isize = i_size_read(inode);
+
+	if (isize == 0)	/* empty file: nothing to read */
+		goto out;
+
+ 	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	/*
+	 * Preallocate as many pages as we will need.
+	 */
+	read_lock_irq(&mapping->tree_lock);
+	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+		unsigned long page_offset = offset + page_idx;
+		/* Never go past the last page of the file. */
+		if (page_offset > end_index)
+			break;
+		/* Already in the page cache: skip without allocating. */
+		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		if (page)
+			continue;
+		/* tree_lock is dropped around the page allocation. */
+		read_unlock_irq(&mapping->tree_lock);
+		page = page_cache_alloc_cold(mapping);
+		read_lock_irq(&mapping->tree_lock);
+		if (!page)	/* allocation failed: read what we have */
+			break;
+		page->index = page_offset;
+		list_add(&page->lru, &page_pool);
+		ret++;
+	}
+	read_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * Now start the IO.  We ignore I/O errors - if the page is not
+	 * uptodate then the caller will launch readpage again, and
+	 * will then handle the error.
+	 */
+	if (ret)
+		read_pages(mapping, filp, &page_pool, ret);
+	BUG_ON(!list_empty(&page_pool));	/* read_pages must drain the pool */
+out:
+	return ret;
+}
+
+/*
+ * Chunk the readahead into 2 megabyte units, so that we don't pin too much
+ * memory at once.
+ */
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+		unsigned long offset, unsigned long nr_to_read)
+{
+	/* Largest batch submitted in one go: 2MB worth of pages. */
+	const unsigned long max_batch = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+	int started = 0;
+
+	/* Readahead is impossible without at least one read method. */
+	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+		return -EINVAL;
+
+	while (nr_to_read) {
+		unsigned long batch;
+		int err;
+
+		batch = nr_to_read < max_batch ? nr_to_read : max_batch;
+		err = __do_page_cache_readahead(mapping, filp, offset, batch);
+		/* A negative result replaces the running total. */
+		if (err < 0)
+			return err;
+
+		started += err;
+		offset += batch;
+		nr_to_read -= batch;
+	}
+	return started;
+}
+
+/*
+ * Check how effective readahead is being.  If the amount of started IO is
+ * less than expected then the file is partly or fully in pagecache and
+ * readahead isn't helping.
+ *
+ */
+static inline int check_ra_success(struct file_ra_state *ra,
+			unsigned long nr_to_read, unsigned long actual)
+{
+	if (actual != 0) {
+		ra->cache_hit = 0;	/* some I/O was started */
+		return 1;
+	}
+	/* No I/O started: count the requested pages as cache hits. */
+	ra->cache_hit += nr_to_read;
+	if (ra->cache_hit < VM_MAX_CACHE_HIT)
+		return 1;
+	ra_off(ra);	/* too many hits: switch readahead off */
+	ra->flags |= RA_FLAG_INCACHE;
+	return 0;
+}
+
+/*
+ * This version skips the IO if the queue is read-congested, and will tell the
+ * block layer to abandon the readahead if request allocation would block.
+ *
+ * force_page_cache_readahead() will ignore queue congestion and will block on
+ * request queues.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	/* Back off with -1 while the backing device is read-congested;
+	 * otherwise start the readahead and report what it returns. */
+	return bdi_read_congested(mapping->backing_dev_info) ? -1 :
+		__do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+}
+
+/*
+ * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
+ * is set, go ahead even if the device is read-congested.  Otherwise give up
+ * without starting any I/O when the device is congested.
+ * Returns 1 meaning 'success' if the read is successful without switching off
+ * readahead mode. Otherwise returns 0 for failure.
+ */
+static int
+blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read,
+			struct file_ra_state *ra, int block)
+{
+	int started;
+
+	if (!block) {
+		if (bdi_read_congested(mapping->backing_dev_info))
+			return 0;
+	}
+	started = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+	return check_ra_success(ra, nr_to_read, started);
+}
+
+static int make_ahead_window(struct address_space *mapping, struct file *filp,
+				struct file_ra_state *ra, int force)
+{
+	int may_block;
+	int ok;
+
+	/* The ahead window starts right where the current window ends. */
+	ra->ahead_start = ra->start + ra->size;
+	ra->ahead_size = get_next_ra_size(ra);
+
+	/* Block if forced, or if the reader already reached the ahead window. */
+	may_block = force || (ra->prev_page >= ra->ahead_start);
+	ok = blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, may_block);
+	if (ok || force)
+		return ok;
+
+	/*
+	 * A read failure in blocking mode implies the pages are all
+	 * cached.  A read failure in non-blocking mode implies we were
+	 * reading more pages than this call requested.  Either way every
+	 * page requested in this call has been taken care of.
+	 *
+	 * Just reset the ahead window in case we failed due to
+	 * congestion.  The ahead window will anyway be closed
+	 * in case we failed due to excessive page cache hits.
+	 */
+	ra->ahead_start = 0;
+	ra->ahead_size = 0;
+	return ok;
+}
+
+/*
+ * page_cache_readahead is the main function.  It performs the adaptive
+ * readahead window size management and submits the readahead I/O.
+ * @offset is the first page index of the read, @req_size its length in
+ * pages.  Returns ra->prev_page + 1 as left by this call.
+ */
+unsigned long
+page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
+		     struct file *filp, unsigned long offset,
+		     unsigned long req_size)
+{
+	unsigned long max, newsize;
+	int sequential;
+
+	/*
+	 * We avoid doing extra work and bogusly perturbing the readahead
+	 * window expansion logic.
+	 */
+	if (offset == ra->prev_page && --req_size)
+		++offset;
+	/* Note that prev_page == -1 if it is a first read */
+	sequential = (offset == ra->prev_page + 1);
+	ra->prev_page = offset;
+
+	max = get_max_readahead(ra);
+	newsize = min(req_size, max);
+
+	/* No readahead or sub-page sized read or file already in cache */
+	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
+		goto out;
+
+	ra->prev_page += newsize - 1;	/* now the last page of this request */
+
+	/*
+	 * Special case - first read at start of file. We'll assume it's
+	 * a whole-file read and grow the window fast.  Or detect first
+	 * sequential access
+	 */
+	if (sequential && ra->size == 0) {
+		ra->size = get_init_ra_size(newsize, max);
+		ra->start = offset;
+		if (!blockable_page_cache_readahead(mapping, filp, offset,
+							 ra->size, ra, 1))
+			goto out;
+
+		/*
+		 * If the request size is larger than our max readahead, we
+		 * at least want to be sure that we get 2 IOs in flight and
+		 * we know that we will definitely need the new I/O.
+		 * once we do this, subsequent calls should be able to overlap
+		 * IOs, thus preventing stalls. so issue the ahead window
+		 * immediately.
+		 */
+		if (req_size >= max)
+			make_ahead_window(mapping, filp, ra, 1);
+
+		goto out;
+	}
+
+	/*
+	 * Now handle the random case:
+	 * partial page reads and first access were handled above,
+	 * so this must be the next page otherwise it is random
+	 */
+	if (!sequential) {
+		ra_off(ra);
+		blockable_page_cache_readahead(mapping, filp, offset,
+				 newsize, ra, 1);
+		goto out;
+	}
+
+	/*
+	 * If we get here we are doing sequential IO and this was not the first
+	 * occurrence (ie we have an existing window)
+	 */
+	if (ra->ahead_start == 0) {	 /* no ahead window yet */
+		if (!make_ahead_window(mapping, filp, ra, 0))
+			goto out;
+	}
+	/*
+	 * Already have an ahead window, check if we crossed into it.
+	 * If so, shift windows and issue a new ahead window.
+	 * Only return the #pages that are in the current window, so that
+	 * we get called back on the first page of the ahead window which
+	 * will allow us to submit more IO.
+	 */
+	if (ra->prev_page >= ra->ahead_start) {
+		ra->start = ra->ahead_start;
+		ra->size = ra->ahead_size;
+		make_ahead_window(mapping, filp, ra, 0);
+	}
+
+out:
+	return ra->prev_page + 1;
+}
+
+/*
+ * handle_ra_miss() is called when it is known that a page which should have
+ * been present in the pagecache (we just did some readahead there) was in fact
+ * not found.  This will happen if it was evicted by the VM (readahead
+ * thrashing)
+ *
+ * Turn on the cache miss flag in the RA struct, this will cause the RA code
+ * to reduce the RA size on the next read.
+ */
+void handle_ra_miss(struct address_space *mapping,
+		struct file_ra_state *ra, pgoff_t offset)
+{
+	/* Record the miss and withdraw the "file is in cache" verdict. */
+	ra->flags = (ra->flags | RA_FLAG_MISS) & ~RA_FLAG_INCACHE;
+}
+
+/*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
+ */
+unsigned long max_sane_readahead(unsigned long nr)
+{
+	unsigned long nr_active, nr_inactive, nr_free;
+
+	__get_zone_counts(&nr_active, &nr_inactive, &nr_free,
+				NODE_DATA(numa_node_id()));
+	/* Cap the request at half of the inactive plus free counts. */
+	return min(nr, (nr_inactive + nr_free) / 2);
+}
diff --git a/mm/rmap.c b/mm/rmap.c
new file mode 100644
index 000000000000..884d6d1928bc
--- /dev/null
+++ b/mm/rmap.c
@@ -0,0 +1,862 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
+ * Released under the General Public License (GPL).
+ *
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
+ *
+ * Provides methods for unmapping each kind of mapped page:
+ * the anon methods track anonymous pages, and
+ * the file methods track pages belonging to an inode.
+ *
+ * Original design by Rik van Riel <riel@conectiva.com.br> 2001
+ * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
+ * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
+ * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ */
+
+/*
+ * Lock ordering in mm:
+ *
+ * inode->i_sem	(while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem
+ *
+ * When a page fault occurs in writing from user to file, down_read
+ * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
+ * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
+ * taken together; in truncation, i_sem is taken outermost.
+ *
+ * mm->mmap_sem
+ *   page->flags PG_locked (lock_page)
+ *     mapping->i_mmap_lock
+ *       anon_vma->lock
+ *         mm->page_table_lock
+ *           zone->lru_lock (in mark_page_accessed)
+ *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             mmlist_lock (in mmput, drain_mmlist and others)
+ *             swap_device_lock (in swap_duplicate, swap_info_get)
+ *             mapping->private_lock (in __set_page_dirty_buffers)
+ *             inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *               sb_lock (within inode_lock in fs/fs-writeback.c)
+ *               mapping->tree_lock (widely used, in set_page_dirty,
+ *                         in arch-dependent flush_dcache_mmap_lock,
+ *                         within inode_lock in __sync_single_inode)
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/rcupdate.h>
+
+#include <asm/tlbflush.h>
+
+//#define RMAP_DEBUG /* can be enabled only for debugging */
+
+kmem_cache_t *anon_vma_cachep;
+
+static inline void validate_anon_vma(struct vm_area_struct *find_vma)
+{
+#ifdef RMAP_DEBUG
+	struct anon_vma *owner = find_vma->anon_vma;
+	struct vm_area_struct *cursor;
+	unsigned int walked = 0;
+	int seen = 0;
+
+	/* find_vma must be on its anon_vma's list, and the list bounded. */
+	list_for_each_entry(cursor, &owner->head, anon_vma_node) {
+		walked++;
+		BUG_ON(walked > 100000);
+		seen |= (cursor == find_vma);
+	}
+	BUG_ON(!seen);
+#endif
+}
+
+/* This must be called under the mmap_sem. */
+int anon_vma_prepare(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	might_sleep();
+	if (unlikely(!anon_vma)) {
+		struct mm_struct *mm = vma->vm_mm;
+		struct anon_vma *allocated, *locked;
+		/* Reuse a mergeable anon_vma if one is found, else allocate. */
+		anon_vma = find_mergeable_anon_vma(vma);
+		if (anon_vma) {
+			allocated = NULL;
+			locked = anon_vma;
+			spin_lock(&locked->lock);	/* existing one: lock it */
+		} else {
+			anon_vma = anon_vma_alloc();
+			if (unlikely(!anon_vma))
+				return -ENOMEM;
+			allocated = anon_vma;
+			locked = NULL;	/* fresh one: no lock is taken on it */
+		}
+
+		/* page_table_lock to protect against threads */
+		spin_lock(&mm->page_table_lock);
+		if (likely(!vma->anon_vma)) {
+			vma->anon_vma = anon_vma;
+			list_add(&vma->anon_vma_node, &anon_vma->head);
+			allocated = NULL;	/* in use: must not be freed below */
+		}
+		spin_unlock(&mm->page_table_lock);
+
+		if (locked)
+			spin_unlock(&locked->lock);
+		if (unlikely(allocated))	/* vma->anon_vma was set meanwhile */
+			anon_vma_free(allocated);
+	}
+	return 0;	/* 0 on success; -ENOMEM is returned above */
+}
+
+void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+{
+	BUG_ON(vma->anon_vma != next->anon_vma);	/* both must share one anon_vma */
+	list_del(&next->anon_vma_node);	/* only @next leaves the list */
+}
+
+void __anon_vma_link(struct vm_area_struct *vma)
+{
+	struct anon_vma *av = vma->anon_vma;
+
+	if (!av)
+		return;		/* no anon_vma: nothing to link */
+	list_add(&vma->anon_vma_node, &av->head);
+	validate_anon_vma(vma);
+}
+
+void anon_vma_link(struct vm_area_struct *vma)
+{
+	struct anon_vma *av = vma->anon_vma;
+
+	if (!av)
+		return;		/* no anon_vma: nothing to link */
+	spin_lock(&av->lock);
+	list_add(&vma->anon_vma_node, &av->head);
+	validate_anon_vma(vma);
+	spin_unlock(&av->lock);
+}
+
+void anon_vma_unlink(struct vm_area_struct *vma)
+{
+	struct anon_vma *av = vma->anon_vma;
+	int last_user;
+
+	/* Nothing to undo for a vma without an anon_vma. */
+	if (!av)
+		return;
+	spin_lock(&av->lock);
+	validate_anon_vma(vma);
+	list_del(&vma->anon_vma_node);
+	/* Sample emptiness under the lock; the free happens after unlock. */
+	last_user = list_empty(&av->head);
+	spin_unlock(&av->lock);
+
+	/* We must garbage collect the anon_vma once its list is empty. */
+	if (last_user)
+		anon_vma_free(av);
+}
+
+static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct anon_vma *anon_vma = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) !=
+						SLAB_CTOR_CONSTRUCTOR)
+		return;	/* act only on a plain constructor call */
+	spin_lock_init(&anon_vma->lock);
+	INIT_LIST_HEAD(&anon_vma->head);
+}
+
+void __init anon_vma_init(void)
+{	/* Result is stored unchecked; SLAB_PANIC presumably makes failure fatal. */
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma relies on RCU to guard against the races.
+ */
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+	struct anon_vma *locked = NULL;
+	unsigned long mapping_bits;
+
+	rcu_read_lock();
+	/* Sample page->mapping once; PAGE_MAPPING_ANON tags anon pages. */
+	mapping_bits = (unsigned long) page->mapping;
+	if ((mapping_bits & PAGE_MAPPING_ANON) && page_mapped(page)) {
+		/* Strip the tag to recover the anon_vma pointer, then lock. */
+		locked = (struct anon_vma *)
+				(mapping_bits - PAGE_MAPPING_ANON);
+		spin_lock(&locked->lock);
+	}
+	rcu_read_unlock();
+	/* NULL means: not tagged anonymous, or not mapped. */
+	return locked;
+}
+
+/*
+ * At what user virtual address is page expected in vma?
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long address = vma->vm_start +
+				((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
+	if (likely(address >= vma->vm_start && address < vma->vm_end))
+		return address;
+
+	/* page should be within any vma from prio_tree_next */
+	BUG_ON(!PageAnon(page));
+	return -EFAULT;
+}
+
+/*
+ * At what user virtual address is page expected in vma? checking that the
+ * page matches the vma: currently only used by unuse_process, on anon pages.
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	if (PageAnon(page)) {
+		/* anonymous: the page must belong to this vma's anon_vma */
+		if ((void *)vma->anon_vma !=
+		    (void *)page->mapping - PAGE_MAPPING_ANON)
+			return -EFAULT;
+	} else if (!page->mapping || (vma->vm_flags & VM_NONLINEAR) ||
+		   vma->vm_file->f_mapping != page->mapping) {
+		return -EFAULT;	/* no mapping, nonlinear vma, or other file */
+	}
+	return vma_address(page, vma);
+}
+
+/*
+ * Subfunctions of page_referenced: page_referenced_one called
+ * repeatedly from either page_referenced_anon or page_referenced_file.
+ */
+static int page_referenced_one(struct page *page,
+	struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int referenced = 0;
+	/* Skip mms whose rss counter reads zero. */
+	if (!get_mm_counter(mm, rss))
+		goto out;
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+	/* Walk pgd -> pud -> pmd -> pte, bailing out at any absent level. */
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+	/* The pte must map this very page, not some other pfn. */
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+	/* A non-zero result from clearing the pte's young state is a hit. */
+	if (ptep_clear_flush_young(vma, address, pte))
+		referenced++;
+	/* Another mm holding the swap token also counts, unless ignored. */
+	if (mm != current->mm && !ignore_token && has_swap_token(mm))
+		referenced++;
+	/* One of the page's mappings has now been accounted for. */
+	(*mapcount)--;
+
+out_unmap:
+	pte_unmap(pte);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+out:
+	return referenced;
+}
+
+static int page_referenced_anon(struct page *page, int ignore_token)
+{
+	struct anon_vma *anon_vma = page_lock_anon_vma(page);
+	struct vm_area_struct *vma;
+	unsigned int remaining;
+	int hits = 0;
+
+	/* No anon_vma could be locked: report no references. */
+	if (!anon_vma)
+		return 0;
+	/* Visit the anon_vma's vmas until every mapping is accounted for. */
+	remaining = page_mapcount(page);
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		hits += page_referenced_one(page, vma, &remaining,
+						ignore_token);
+		if (remaining == 0)
+			break;
+	}
+	spin_unlock(&anon_vma->lock);
+	return hits;
+}
+
+/**
+ * page_referenced_file - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ * @ignore_token: passed through to page_referenced_one for each vma.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag.  This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds.  It returns the number
+ * of references it found.
+ * This function is only called from page_referenced for object-based pages.
+ */
+static int page_referenced_file(struct page *page, int ignore_token)
+{
+	unsigned int mapcount;
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int referenced = 0;
+
+	/*
+	 * The caller's checks on page->mapping and !PageAnon have made
+	 * sure that this is a file page: the check for page->mapping
+	 * excludes the case just before it gets set on an anon page.
+	 */
+	BUG_ON(PageAnon(page));
+
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_lock.
+	 */
+	BUG_ON(!PageLocked(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	/*
+	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * is more likely to be accurate if we note it after spinning.
+	 */
+	mapcount = page_mapcount(page);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+				  == (VM_LOCKED|VM_MAYSHARE)) {
+			referenced++;	/* locked shared vma: count it and stop */
+			break;
+		}
+		referenced += page_referenced_one(page, vma, &mapcount,
+							ignore_token);
+		if (!mapcount)	/* every mapping has been accounted for */
+			break;
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return referenced;
+}
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ * @is_locked: caller holds lock on the page
+ * @ignore_token: do not count a reference just because an mm has the swap token
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of references found (ptes plus the page's own flags).
+ */
+int page_referenced(struct page *page, int is_locked, int ignore_token)
+{
+	int referenced = 0;
+
+	if (!swap_token_default_timeout)
+		ignore_token = 1;
+
+	if (page_test_and_clear_young(page))
+		referenced++;
+	if (TestClearPageReferenced(page))
+		referenced++;
+	/* Only mapped pages that have a mapping need their ptes checked. */
+	if (!page_mapped(page) || !page->mapping)
+		return referenced;
+
+	if (PageAnon(page))
+		return referenced + page_referenced_anon(page, ignore_token);
+	if (is_locked)
+		return referenced + page_referenced_file(page, ignore_token);
+	/* Unlocked file page: if the page lock is taken, assume referenced. */
+	if (TestSetPageLocked(page))
+		return referenced + 1;
+	/* We hold the page lock now; recheck ->mapping before walking. */
+	if (page->mapping)
+		referenced += page_referenced_file(page, ignore_token);
+	unlock_page(page);
+	return referenced;
+}
+
+/**
+ * page_add_anon_rmap - add pte mapping to an anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	struct address_space *tagged;
+	pgoff_t index;
+
+	BUG_ON(PageReserved(page));
+	BUG_ON(!anon_vma);
+
+	inc_mm_counter(vma->vm_mm, anon_rss);
+	tagged = (struct address_space *) ((void *) anon_vma + PAGE_MAPPING_ANON);
+	index = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+
+	/* Index and mapping are written only when _mapcount reaches zero. */
+	if (atomic_inc_and_test(&page->_mapcount)) {
+		page->index = index;
+		page->mapping = tagged;
+		inc_page_state(nr_mapped);
+	}
+	/* else checking page index and mapping is racy */
+}
+
+/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_file_rmap(struct page *page)
+{
+	BUG_ON(PageAnon(page));
+	/* Pages without a valid pfn and reserved pages are not accounted. */
+	if (pfn_valid(page_to_pfn(page)) && !PageReserved(page)) {
+		if (atomic_inc_and_test(&page->_mapcount))
+			inc_page_state(nr_mapped);
+	}
+}
+
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ *
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void page_remove_rmap(struct page *page)
+{
+	BUG_ON(PageReserved(page));
+
+	/* Nothing more to do unless _mapcount just went negative. */
+	if (!atomic_add_negative(-1, &page->_mapcount))
+		return;
+	BUG_ON(page_mapcount(page) < 0);
+	/*
+	 * Resetting the PageAnon mapping here would be tidy, but it
+	 * could overwrite a racing page_add_anon_rmap which increments
+	 * mapcount after us yet sets mapping before us.  So the reset
+	 * is left to free_hot_cold_page, and the mapping is only
+	 * reliable while mapped.  Leaving it set also helps swapoff
+	 * reinstate ptes faster for pages still in swapcache.
+	 */
+	if (page_test_and_clear_dirty(page))
+		set_page_dirty(page);
+	dec_page_state(nr_mapped);
+}
+
+/*
+ * Subfunctions of try_to_unmap: try_to_unmap_one called
+ * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ */
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	int ret = SWAP_AGAIN;	/* returned unless a check below sets SWAP_FAIL */
+	/* Skip mms whose rss counter reads zero. */
+	if (!get_mm_counter(mm, rss))
+		goto out;
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
+	/* Walk pgd -> pud -> pmd -> pte, bailing out at any absent level. */
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+	/* The pte must map this very page, not some other pfn. */
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+
+	/*
+	 * If the page is mlock()d, we cannot swap it out.
+	 * If it's recently referenced (perhaps page_referenced
+	 * skipped over this mm) then we should reactivate it.
+	 */
+	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
+			ptep_clear_flush_young(vma, address, pte)) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	/*
+	 * Don't pull an anonymous page out from under get_user_pages.
+	 * GUP carefully breaks COW and raises page count (while holding
+	 * page_table_lock, as we have here) to make sure that the page
+	 * cannot be freed.  If we unmap that page here, a user write
+	 * access to the virtual address will bring back the page, but
+	 * its raised count will (ironically) be taken to mean it's not
+	 * an exclusive swap page, do_wp_page will replace it by a copy
+	 * page, and the user never gets to see the data GUP was holding
+	 * the original page for.
+	 *
+	 * This test is also useful for when swapoff (unuse_process) has
+	 * to drop page lock: its reference to the page stops existing
+	 * ptes from being unmapped, so swapoff can make progress.
+	 */
+	if (PageSwapCache(page) &&
+	    page_count(page) != page_mapcount(page) + 2) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	/* Nuke the page table entry. */
+	flush_cache_page(vma, address, page_to_pfn(page));
+	pteval = ptep_clear_flush(vma, address, pte);
+
+	/* Move the dirty bit to the physical page now the pte is gone. */
+	if (pte_dirty(pteval))
+		set_page_dirty(page);
+
+	if (PageAnon(page)) {
+		swp_entry_t entry = { .val = page->private };
+		/*
+		 * Store the swap location in the pte.
+		 * See handle_pte_fault() ...
+		 */
+		BUG_ON(!PageSwapCache(page));
+		swap_duplicate(entry);
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+		BUG_ON(pte_file(*pte));
+		dec_mm_counter(mm, anon_rss);
+	}
+	/* The pte is gone: this mm maps one page fewer, so rss goes down. */
+	dec_mm_counter(mm, rss);
+	page_remove_rmap(page);
+	page_cache_release(page);
+
+out_unmap:
+	pte_unmap(pte);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+out:
+	return ret;
+}
+
+/*
+ * objrmap doesn't work for nonlinear VMAs because the assumption that
+ * offset-into-file correlates with offset-into-virtual-addresses does not hold.
+ * Consequently, given a particular page and its ->index, we cannot locate the
+ * ptes which are mapping that page without an exhaustive linear search.
+ *
+ * So what this code does is a mini "virtual scan" of each nonlinear VMA which
+ * maps the file to which the target page belongs.  The ->vm_private_data field
+ * holds the current cursor into that scan.  Successive searches will circulate
+ * around the vma's virtual address space.
+ *
+ * So as more replacement pressure is applied to the pages in a nonlinear VMA,
+ * more scanning pressure is placed against them as well.   Eventually pages
+ * will become fully unmapped and are eligible for eviction.
+ *
+ * For very sparsely populated VMAs this is a little inefficient - chances are
+ * there won't be many ptes located within the scan cluster.  In this case
+ * maybe we could scan further - to the end of the pte page, perhaps.
+ */
+#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
+#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
+
+static void try_to_unmap_cluster(unsigned long cursor,
+	unsigned int *mapcount, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	struct page *page;
+	unsigned long address;
+	unsigned long end;
+	unsigned long pfn;
+
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
+
+	address = (vma->vm_start + cursor) & CLUSTER_MASK;
+	end = address + CLUSTER_SIZE;
+	if (address < vma->vm_start)
+		address = vma->vm_start;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	for (pte = pte_offset_map(pmd, address);
+			address < end; pte++, address += PAGE_SIZE) {
+
+		if (!pte_present(*pte))
+			continue;
+
+		pfn = pte_pfn(*pte);
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		BUG_ON(PageAnon(page));
+		if (PageReserved(page))
+			continue;
+
+		if (ptep_clear_flush_young(vma, address, pte))
+			continue;
+
+		/* Nuke the page table entry. */
+		flush_cache_page(vma, address, pfn);
+		pteval = ptep_clear_flush(vma, address, pte);
+
+		/* If nonlinear, store the file page offset in the pte. */
+		if (page->index != linear_page_index(vma, address))
+			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+
+		/* Move the dirty bit to the physical page now the pte is gone. */
+		if (pte_dirty(pteval))
+			set_page_dirty(page);
+
+		page_remove_rmap(page);
+		page_cache_release(page);
+		dec_mm_counter(mm, rss);
+		(*mapcount)--;
+	}
+
+	pte_unmap(pte);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+}
+
+static int try_to_unmap_anon(struct page *page)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return ret;
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		ret = try_to_unmap_one(page, vma);
+		if (ret == SWAP_FAIL || !page_mapped(page))
+			break;
+	}
+	spin_unlock(&anon_vma->lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap_file - unmap file page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ */
+static int try_to_unmap_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = SWAP_AGAIN;
+	unsigned long cursor;
+	unsigned long max_nl_cursor = 0;
+	unsigned long max_nl_size = 0;
+	unsigned int mapcount;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		ret = try_to_unmap_one(page, vma);
+		if (ret == SWAP_FAIL || !page_mapped(page))
+			goto out;
+	}
+
+	if (list_empty(&mapping->i_mmap_nonlinear))
+		goto out;
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+			continue;
+		cursor = (unsigned long) vma->vm_private_data;
+		if (cursor > max_nl_cursor)
+			max_nl_cursor = cursor;
+		cursor = vma->vm_end - vma->vm_start;
+		if (cursor > max_nl_size)
+			max_nl_size = cursor;
+	}
+
+	if (max_nl_size == 0) {	/* all nonlinears locked or reserved */
+		ret = SWAP_FAIL;
+		goto out;
+	}
+
+	/*
+	 * We don't try to search for this page in the nonlinear vmas,
+	 * and page_referenced wouldn't have found it anyway.  Instead
+	 * just walk the nonlinear vmas trying to age and unmap some.
+	 * The mapcount of the page we came in with is irrelevant,
+	 * but even so use it as a guide to how hard we should try?
+	 */
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		goto out;
+	cond_resched_lock(&mapping->i_mmap_lock);
+
+	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
+	if (max_nl_cursor == 0)
+		max_nl_cursor = CLUSTER_SIZE;
+
+	do {
+		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+				continue;
+			cursor = (unsigned long) vma->vm_private_data;
+			while (get_mm_counter(vma->vm_mm, rss) &&
+				cursor < max_nl_cursor &&
+				cursor < vma->vm_end - vma->vm_start) {
+				try_to_unmap_cluster(cursor, &mapcount, vma);
+				cursor += CLUSTER_SIZE;
+				vma->vm_private_data = (void *) cursor;
+				if ((int)mapcount <= 0)
+					goto out;
+			}
+			vma->vm_private_data = (void *) max_nl_cursor;
+		}
+		cond_resched_lock(&mapping->i_mmap_lock);
+		max_nl_cursor += CLUSTER_SIZE;
+	} while (max_nl_cursor <= max_nl_size);
+
+	/*
+	 * Don't loop forever (perhaps all the remaining pages are
+	 * in locked vmas).  Reset cursor on all unreserved nonlinear
+	 * vmas, now forgetting on which ones it had fallen behind.
+	 */
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (!(vma->vm_flags & VM_RESERVED))
+			vma->vm_private_data = NULL;
+	}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path.  Caller must hold the page lock.
+ * Return values are:
+ *
+ * SWAP_SUCCESS	- we succeeded in removing all mappings
+ * SWAP_AGAIN	- we missed a mapping, try again later
+ * SWAP_FAIL	- the page is unswappable
+ */
+int try_to_unmap(struct page *page)
+{
+	int ret;
+
+	BUG_ON(PageReserved(page));
+	BUG_ON(!PageLocked(page));
+
+	if (PageAnon(page))
+		ret = try_to_unmap_anon(page);
+	else
+		ret = try_to_unmap_file(page);
+
+	if (!page_mapped(page))
+		ret = SWAP_SUCCESS;
+	return ret;
+}
diff --git a/mm/shmem.c b/mm/shmem.c
new file mode 100644
index 000000000000..61574b81d979
--- /dev/null
+++ b/mm/shmem.c
@@ -0,0 +1,2326 @@
+/*
+ * Resizable virtual memory filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *		 2000 Transmeta Corp.
+ *		 2000-2001 Christoph Rohland
+ *		 2000-2001 SAP AG
+ *		 2002 Red Hat Inc.
+ * Copyright (C) 2002-2004 Hugh Dickins.
+ * Copyright (C) 2002-2004 VERITAS Software Corporation.
+ * Copyright (C) 2004 Andi Kleen, SuSE Labs
+ *
+ * Extended attribute support for tmpfs:
+ * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+ * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+/*
+ * This virtual memory filesystem is heavily based on the ramfs. It
+ * extends ramfs by the ability to use swap and honor resource limits
+ * which makes it a completely usable filesystem.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/shmem_fs.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/vfs.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/swapops.h>
+#include <linux/mempolicy.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <asm/pgtable.h>
+
+/* This magic number is used in glibc for posix shared memory */
+#define TMPFS_MAGIC	0x01021994
+
+#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
+#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
+#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
+
+#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
+
+#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
+
+/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
+#define SHMEM_PAGEIN	 VM_READ
+#define SHMEM_TRUNCATE	 VM_WRITE
+
+/* Definition to limit shmem_truncate's steps between cond_rescheds */
+#define LATENCY_LIMIT	 64
+
+/* Pretend that each entry is of this size in directory's i_size */
+#define BOGO_DIRENT_SIZE 20
+
+/* Keep swapped page count in private field of indirect struct page */
+#define nr_swapped		private
+
+/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+enum sgp_type {
+	SGP_QUICK,	/* don't try more than file page cache lookup */
+	SGP_READ,	/* don't exceed i_size, don't allocate page */
+	SGP_CACHE,	/* don't exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate page */
+};
+
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+			 struct page **pagep, enum sgp_type sgp, int *type);
+
+static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
+{
+	/*
+	 * The above definition of ENTRIES_PER_PAGE, and the use of
+	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
+	 * might be reconsidered if it ever diverges from PAGE_SIZE.
+	 */
+	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+}
+
+static inline void shmem_dir_free(struct page *page)
+{
+	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+}
+
+static struct page **shmem_dir_map(struct page *page)
+{
+	return (struct page **)kmap_atomic(page, KM_USER0);
+}
+
+static inline void shmem_dir_unmap(struct page **dir)
+{
+	kunmap_atomic(dir, KM_USER0);
+}
+
+static swp_entry_t *shmem_swp_map(struct page *page)
+{
+	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
+}
+
+static inline void shmem_swp_balance_unmap(void)
+{
+	/*
+	 * When passing a pointer to an i_direct entry, to code which
+	 * also handles indirect entries and so will shmem_swp_unmap,
+	 * we must arrange for the preempt count to remain in balance.
+	 * What kmap_atomic of a lowmem page does depends on config
+	 * and architecture, so pretend to kmap_atomic some lowmem page.
+	 */
+	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
+}
+
+static inline void shmem_swp_unmap(swp_entry_t *entry)
+{
+	kunmap_atomic(entry, KM_USER1);
+}
+
+static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/*
+ * shmem_file_setup pre-accounts the whole fixed size of a VM object,
+ * for shared memory and for shared anonymous (/dev/zero) mappings
+ * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
+ * consistent with the pre-accounting of private mappings ...
+ */
+static inline int shmem_acct_size(unsigned long flags, loff_t size)
+{
+	return (flags & VM_ACCOUNT)?
+		security_vm_enough_memory(VM_ACCT(size)): 0;
+}
+
+static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+{
+	if (flags & VM_ACCOUNT)
+		vm_unacct_memory(VM_ACCT(size));
+}
+
+/*
+ * ... whereas tmpfs objects are accounted incrementally as
+ * pages are allocated, in order to allow huge sparse files.
+ * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
+ */
+static inline int shmem_acct_block(unsigned long flags)
+{
+	return (flags & VM_ACCOUNT)?
+		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
+}
+
+static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+{
+	if (!(flags & VM_ACCOUNT))
+		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+}
+
+static struct super_operations shmem_ops;
+static struct address_space_operations shmem_aops;
+static struct file_operations shmem_file_operations;
+static struct inode_operations shmem_inode_operations;
+static struct inode_operations shmem_dir_inode_operations;
+static struct inode_operations shmem_special_inode_operations;
+static struct vm_operations_struct shmem_vm_ops;
+
+static struct backing_dev_info shmem_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.unplug_io_fn	= default_unplug_io_fn,
+};
+
+static LIST_HEAD(shmem_swaplist);
+static DEFINE_SPINLOCK(shmem_swaplist_lock);
+
+static void shmem_free_blocks(struct inode *inode, long pages)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	if (sbinfo) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_blocks += pages;
+		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+/*
+ * shmem_recalc_inode - recalculate the size of an inode
+ *
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop
+ * undirtied hole pages behind our back.
+ *
+ * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
+ * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * It has to be called with the spinlock held.
+ */
+static void shmem_recalc_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	long freed;
+
+	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+	if (freed > 0) {
+		info->alloced -= freed;
+		shmem_unacct_blocks(info->flags, freed);
+		shmem_free_blocks(inode, freed);
+	}
+}
+
+/*
+ * shmem_swp_entry - find the swap vector position in the info structure
+ *
+ * @info:  info structure for the inode
+ * @index: index of the page to find
+ * @page:  optional page to add to the structure. Has to be preset to
+ *         all zeros
+ *
+ * If there is no space allocated yet it will return NULL when
+ * page is NULL, else it will use the page for the needed block,
+ * setting it to NULL on return to indicate that it has been used.
+ *
+ * The swap vector is organized the following way:
+ *
+ * There are SHMEM_NR_DIRECT entries directly stored in the
+ * shmem_inode_info structure. So small files do not need an additional
+ * allocation.
+ *
+ * For pages with index >= SHMEM_NR_DIRECT there is the pointer
+ * i_indirect which points to a page which holds in the first half
+ * doubly indirect blocks, in the second half triple indirect blocks:
+ *
+ * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
+ * following layout (for SHMEM_NR_DIRECT == 16):
+ *
+ * i_indirect -> dir --> 16-19
+ * 	      |	     +-> 20-23
+ * 	      |
+ * 	      +-->dir2 --> 24-27
+ * 	      |	       +-> 28-31
+ * 	      |	       +-> 32-35
+ * 	      |	       +-> 36-39
+ * 	      |
+ * 	      +-->dir3 --> 40-43
+ * 	       	       +-> 44-47
+ * 	      	       +-> 48-51
+ * 	      	       +-> 52-55
+ */
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
+{
+	unsigned long offset;
+	struct page **dir;
+	struct page *subdir;
+
+	if (index < SHMEM_NR_DIRECT) {
+		shmem_swp_balance_unmap();
+		return info->i_direct+index;
+	}
+	if (!info->i_indirect) {
+		if (page) {
+			info->i_indirect = *page;
+			*page = NULL;
+		}
+		return NULL;			/* need another page */
+	}
+
+	index -= SHMEM_NR_DIRECT;
+	offset = index % ENTRIES_PER_PAGE;
+	index /= ENTRIES_PER_PAGE;
+	dir = shmem_dir_map(info->i_indirect);
+
+	if (index >= ENTRIES_PER_PAGE/2) {
+		index -= ENTRIES_PER_PAGE/2;
+		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
+		index %= ENTRIES_PER_PAGE;
+		subdir = *dir;
+		if (!subdir) {
+			if (page) {
+				*dir = *page;
+				*page = NULL;
+			}
+			shmem_dir_unmap(dir);
+			return NULL;		/* need another page */
+		}
+		shmem_dir_unmap(dir);
+		dir = shmem_dir_map(subdir);
+	}
+
+	dir += index;
+	subdir = *dir;
+	if (!subdir) {
+		if (!page || !(subdir = *page)) {
+			shmem_dir_unmap(dir);
+			return NULL;		/* need a page */
+		}
+		*dir = subdir;
+		*page = NULL;
+	}
+	shmem_dir_unmap(dir);
+	return shmem_swp_map(subdir) + offset;
+}
+
+static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
+{
+	long incdec = value? 1: -1;	/* +1 when storing a swap entry, -1 when clearing it */
+
+	entry->val = value;
+	info->swapped += incdec;
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)	/* not within i_direct[] */
+		kmap_atomic_to_page(entry)->nr_swapped += incdec;	/* count kept in the index page */
+}
+
+/*
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ *                   If it does not exist allocate the entry.
+ *
+ * @info:	info structure for the inode
+ * @index:	index of the page to find
+ * @sgp:	check and recheck i_size? skip allocation?
+ */
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct page *page = NULL;
+	swp_entry_t *entry;
+
+	if (sgp != SGP_WRITE &&
+	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+		return ERR_PTR(-EINVAL);
+
+	while (!(entry = shmem_swp_entry(info, index, &page))) {
+		if (sgp == SGP_READ)
+			return shmem_swp_map(ZERO_PAGE(0));
+		/*
+		 * Test free_blocks against 1 not 0, since we have 1 data
+		 * page (and perhaps indirect index pages) yet to allocate:
+		 * a waste to allocate index if we cannot allocate data.
+		 */
+		if (sbinfo) {
+			spin_lock(&sbinfo->stat_lock);
+			if (sbinfo->free_blocks <= 1) {
+				spin_unlock(&sbinfo->stat_lock);
+				return ERR_PTR(-ENOSPC);
+			}
+			sbinfo->free_blocks--;
+			inode->i_blocks += BLOCKS_PER_PAGE;
+			spin_unlock(&sbinfo->stat_lock);
+		}
+
+		spin_unlock(&info->lock);
+		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+		if (page) {
+			page->nr_swapped = 0;
+		}
+		spin_lock(&info->lock);
+
+		if (!page) {
+			shmem_free_blocks(inode, 1);
+			return ERR_PTR(-ENOMEM);
+		}
+		if (sgp != SGP_WRITE &&
+		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+			entry = ERR_PTR(-EINVAL);
+			break;
+		}
+		if (info->next_index <= index)
+			info->next_index = index + 1;
+	}
+	if (page) {
+		/* another task gave its page, or truncated the file */
+		shmem_free_blocks(inode, 1);
+		shmem_dir_free(page);
+	}
+	if (info->next_index <= index && !IS_ERR(entry))
+		info->next_index = index + 1;
+	return entry;
+}
+
+/*
+ * shmem_free_swp - free some swap entries in a directory
+ *
+ * @dir:   pointer to the directory
+ * @edir:  pointer after last entry of the directory
+ */
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+{
+	swp_entry_t *ptr;
+	int freed = 0;
+
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val) {
+			free_swap_and_cache(*ptr);
+			*ptr = (swp_entry_t){0};
+			freed++;
+		}
+	}
+	return freed;
+}
+
+static int shmem_map_and_free_swp(struct page *subdir,
+		int offset, int limit, struct page ***dir)
+{
+	swp_entry_t *ptr;
+	int freed = 0;
+
+	ptr = shmem_swp_map(subdir);
+	for (; offset < limit; offset += LATENCY_LIMIT) {
+		int size = limit - offset;
+		if (size > LATENCY_LIMIT)
+			size = LATENCY_LIMIT;
+		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
+		if (need_resched()) {
+			shmem_swp_unmap(ptr);
+			if (*dir) {
+				shmem_dir_unmap(*dir);
+				*dir = NULL;
+			}
+			cond_resched();
+			ptr = shmem_swp_map(subdir);
+		}
+	}
+	shmem_swp_unmap(ptr);
+	return freed;
+}
+
+static void shmem_free_pages(struct list_head *next)
+{
+	struct page *page;
+	int freed = 0;	/* pages freed since the last cond_resched */
+
+	do {
+		page = container_of(next, struct page, lru);
+		next = next->next;	/* read the link before the page is freed */
+		shmem_dir_free(page);
+		freed++;
+		if (freed >= LATENCY_LIMIT) {
+			cond_resched();
+			freed = 0;
+		}
+	} while (next);	/* shmem_truncate NULL-terminates the list before calling */
+}
+
+static void shmem_truncate(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	unsigned long diroff;
+	struct page **dir;
+	struct page *topdir;
+	struct page *middir;
+	struct page *subdir;
+	swp_entry_t *ptr;
+	LIST_HEAD(pages_to_free);
+	long nr_pages_to_free = 0;
+	long nr_swaps_freed = 0;
+	int offset;
+	int freed;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (idx >= info->next_index)
+		return;
+
+	spin_lock(&info->lock);
+	info->flags |= SHMEM_TRUNCATE;
+	limit = info->next_index;
+	info->next_index = idx;
+	topdir = info->i_indirect;
+	if (topdir && idx <= SHMEM_NR_DIRECT) {
+		info->i_indirect = NULL;
+		nr_pages_to_free++;
+		list_add(&topdir->lru, &pages_to_free);
+	}
+	spin_unlock(&info->lock);
+
+	if (info->swapped && idx < SHMEM_NR_DIRECT) {
+		ptr = info->i_direct;
+		size = limit;
+		if (size > SHMEM_NR_DIRECT)
+			size = SHMEM_NR_DIRECT;
+		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+	}
+	if (!topdir)
+		goto done2;
+
+	BUG_ON(limit <= SHMEM_NR_DIRECT);
+	limit -= SHMEM_NR_DIRECT;
+	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
+	offset = idx % ENTRIES_PER_PAGE;
+	idx -= offset;
+
+	dir = shmem_dir_map(topdir);
+	stage = ENTRIES_PER_PAGEPAGE/2;
+	if (idx < ENTRIES_PER_PAGEPAGE/2) {
+		middir = topdir;
+		diroff = idx/ENTRIES_PER_PAGE;
+	} else {
+		dir += ENTRIES_PER_PAGE/2;
+		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
+		while (stage <= idx)
+			stage += ENTRIES_PER_PAGEPAGE;
+		middir = *dir;
+		if (*dir) {
+			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
+				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
+			if (!diroff && !offset) {
+				*dir = NULL;
+				nr_pages_to_free++;
+				list_add(&middir->lru, &pages_to_free);
+			}
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(middir);
+		} else {
+			diroff = 0;
+			offset = 0;
+			idx = stage;
+		}
+	}
+
+	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(topdir) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto done1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			middir = *dir;
+			*dir = NULL;
+			nr_pages_to_free++;
+			list_add(&middir->lru, &pages_to_free);
+			shmem_dir_unmap(dir);
+			cond_resched();
+			dir = shmem_dir_map(middir);
+			diroff = 0;
+		}
+		subdir = dir[diroff];
+		if (subdir && subdir->nr_swapped) {
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+			freed = shmem_map_and_free_swp(subdir,
+						offset, size, &dir);
+			if (!dir)
+				dir = shmem_dir_map(middir);
+			nr_swaps_freed += freed;
+			if (offset)
+				spin_lock(&info->lock);
+			subdir->nr_swapped -= freed;
+			if (offset)
+				spin_unlock(&info->lock);
+			BUG_ON(subdir->nr_swapped > offset);
+		}
+		if (offset)
+			offset = 0;
+		else if (subdir) {
+			dir[diroff] = NULL;
+			nr_pages_to_free++;
+			list_add(&subdir->lru, &pages_to_free);
+		}
+	}
+done1:
+	shmem_dir_unmap(dir);
+done2:
+	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
+		/*
+		 * Call truncate_inode_pages again: racing shmem_unuse_inode
+		 * may have swizzled a page in from swap since vmtruncate or
+		 * generic_delete_inode did it, before we lowered next_index.
+		 * Also, though shmem_getpage checks i_size before adding to
+		 * cache, no recheck after: so fix the narrow window there too.
+		 */
+		truncate_inode_pages(inode->i_mapping, inode->i_size);
+	}
+
+	spin_lock(&info->lock);
+	info->flags &= ~SHMEM_TRUNCATE;
+	info->swapped -= nr_swaps_freed;
+	if (nr_pages_to_free)
+		shmem_free_blocks(inode, nr_pages_to_free);
+	shmem_recalc_inode(inode);
+	spin_unlock(&info->lock);
+
+	/*
+	 * Empty swap vector directory pages to be freed?
+	 */
+	if (!list_empty(&pages_to_free)) {
+		pages_to_free.prev->next = NULL;
+		shmem_free_pages(pages_to_free.next);
+	}
+}
+
+static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct page *page = NULL;
+	int error;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size < inode->i_size) {
+			/*
+			 * If truncating down to a partial page, then
+			 * if that page is already allocated, hold it
+			 * in memory until the truncation is over, so
+			 * truncate_partial_page cannot miss it were
+			 * it assigned to swap.
+			 */
+			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+				(void) shmem_getpage(inode,
+					attr->ia_size>>PAGE_CACHE_SHIFT,
+						&page, SGP_READ, NULL);
+			}
+			/*
+			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
+			 * detect if any pages might have been added to cache
+			 * after truncate_inode_pages.  But we needn't bother
+			 * if it's being fully truncated to zero-length: the
+			 * nrpages check is efficient enough in that case.
+			 */
+			if (attr->ia_size) {
+				struct shmem_inode_info *info = SHMEM_I(inode);
+				spin_lock(&info->lock);
+				info->flags &= ~SHMEM_PAGEIN;
+				spin_unlock(&info->lock);
+			}
+		}
+	}
+
+	error = inode_change_ok(inode, attr);
+	if (!error)
+		error = inode_setattr(inode, attr);
+	if (page)
+		page_cache_release(page);
+	return error;
+}
+
+static void shmem_delete_inode(struct inode *inode)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (inode->i_op->truncate == shmem_truncate) {
+		shmem_unacct_size(info->flags, inode->i_size);
+		inode->i_size = 0;
+		shmem_truncate(inode);
+		if (!list_empty(&info->swaplist)) {
+			spin_lock(&shmem_swaplist_lock);
+			list_del_init(&info->swaplist);
+			spin_unlock(&shmem_swaplist_lock);
+		}
+	}
+	if (sbinfo) {
+		BUG_ON(inode->i_blocks);
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+	clear_inode(inode);
+}
+
+static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
+{
+	swp_entry_t *ptr;
+
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val == entry.val)
+			return ptr - dir;
+	}
+	return -1;
+}
+
+static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
+{
+	struct inode *inode;
+	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	struct page **dir;
+	struct page *subdir;
+	swp_entry_t *ptr;
+	int offset;
+
+	idx = 0;
+	ptr = info->i_direct;
+	spin_lock(&info->lock);
+	limit = info->next_index;
+	size = limit;
+	if (size > SHMEM_NR_DIRECT)
+		size = SHMEM_NR_DIRECT;
+	offset = shmem_find_swp(entry, ptr, ptr+size);
+	if (offset >= 0) {
+		shmem_swp_balance_unmap();
+		goto found;
+	}
+	if (!info->i_indirect)
+		goto lost2;
+
+	dir = shmem_dir_map(info->i_indirect);
+	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
+
+	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir-1);
+			dir = shmem_dir_map(info->i_indirect) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto lost1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			subdir = *dir;
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(subdir);
+		}
+		subdir = *dir;
+		if (subdir && subdir->nr_swapped) {
+			ptr = shmem_swp_map(subdir);
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+			offset = shmem_find_swp(entry, ptr, ptr+size);
+			if (offset >= 0) {
+				shmem_dir_unmap(dir);
+				goto found;
+			}
+			shmem_swp_unmap(ptr);
+		}
+	}
+lost1:
+	shmem_dir_unmap(dir-1);
+lost2:
+	spin_unlock(&info->lock);
+	return 0;
+found:
+	idx += offset;
+	inode = &info->vfs_inode;
+	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
+		info->flags |= SHMEM_PAGEIN;
+		shmem_swp_set(info, ptr + offset, 0);
+	}
+	shmem_swp_unmap(ptr);
+	spin_unlock(&info->lock);
+	/*
+	 * Decrement swap count even when the entry is left behind:
+	 * try_to_unuse will skip over mms, then reincrement count.
+	 */
+	swap_free(entry);
+	return 1;
+}
+
+/*
+ * shmem_unuse() searches for a possibly swapped-out shmem page.
+ */
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+	struct list_head *p, *next;
+	struct shmem_inode_info *info;
+	int found = 0;
+
+	spin_lock(&shmem_swaplist_lock);
+	list_for_each_safe(p, next, &shmem_swaplist) {
+		info = list_entry(p, struct shmem_inode_info, swaplist);
+		if (!info->swapped)
+			list_del_init(&info->swaplist);
+		else if (shmem_unuse_inode(info, entry, page)) {
+			/* move head to start search for next from here */
+			list_move_tail(&shmem_swaplist, &info->swaplist);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&shmem_swaplist_lock);
+	return found;
+}
+
+/*
+ * Move the page from the page cache to the swap cache.
+ *
+ * The page must arrive locked and not mapped into any page table (both
+ * are asserted below).  wbc is not used by this function.
+ *
+ * Returns 0 with the page unlocked once it has been moved to the swap
+ * cache and its swap entry recorded in the inode.  Otherwise the page is
+ * redirtied and WRITEPAGE_ACTIVATE is returned with the page still locked.
+ */
+static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct shmem_inode_info *info;
+	swp_entry_t *entry, swap;
+	struct address_space *mapping;
+	unsigned long index;
+	struct inode *inode;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(page_mapped(page));
+
+	mapping = page->mapping;
+	index = page->index;
+	inode = mapping->host;
+	info = SHMEM_I(inode);
+	/* Inodes flagged VM_LOCKED (see shmem_lock) are never sent to swap */
+	if (info->flags & VM_LOCKED)
+		goto redirty;
+	swap = get_swap_page();
+	/* A zero swap.val is treated as "no swap slot obtained" */
+	if (!swap.val)
+		goto redirty;
+
+	spin_lock(&info->lock);
+	shmem_recalc_inode(inode);
+	if (index >= info->next_index) {
+		/* Beyond next_index is only tolerated while SHMEM_TRUNCATE is set */
+		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
+		goto unlock;
+	}
+	entry = shmem_swp_entry(info, index, NULL);
+	BUG_ON(!entry);
+	/* The slot for this index must still be empty */
+	BUG_ON(entry->val);
+
+	if (move_to_swap_cache(page, swap) == 0) {
+		shmem_swp_set(info, entry, swap.val);
+		shmem_swp_unmap(entry);
+		spin_unlock(&info->lock);
+		/* Make sure shmem_unuse() can find this inode on the swaplist */
+		if (list_empty(&info->swaplist)) {
+			spin_lock(&shmem_swaplist_lock);
+			/* move instead of add in case we're racing */
+			list_move_tail(&info->swaplist, &shmem_swaplist);
+			spin_unlock(&shmem_swaplist_lock);
+		}
+		unlock_page(page);
+		return 0;
+	}
+
+	shmem_swp_unmap(entry);
+unlock:
+	spin_unlock(&info->lock);
+	/* Give back the swap slot we allocated but did not use */
+	swap_free(swap);
+redirty:
+	set_page_dirty(page);
+	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Start (or join) the read of one swap entry, using the shared memory
+ * policy that applies at file index idx.  Returns whatever
+ * read_swap_cache_async() returns.
+ */
+static struct page *shmem_swapin_async(struct shared_policy *p,
+				       swp_entry_t entry, unsigned long idx)
+{
+	struct vm_area_struct pvma;
+	struct page *swapped_in;
+
+	/* Zeroed stand-in vma: only offset, end and policy are filled in */
+	memset(&pvma, 0, sizeof(pvma));
+	pvma.vm_pgoff = idx;
+	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
+
+	swapped_in = read_swap_cache_async(entry, &pvma, 0);
+
+	/* Release the policy obtained by the lookup above */
+	mpol_free(pvma.vm_policy);
+	return swapped_in;
+}
+
+/*
+ * NUMA variant: kick off reads for the swap handles reported by
+ * valid_swaphandles() around "entry", then read "entry" itself under
+ * the inode's shared policy and return that page.
+ */
+struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
+			  unsigned long idx)
+{
+	struct shared_policy *policy = &info->policy;
+	unsigned long offset;
+	int remaining;
+
+	remaining = valid_swaphandles(entry, &offset);
+	while (remaining-- > 0) {
+		struct page *ahead;
+
+		ahead = shmem_swapin_async(policy,
+				swp_entry(swp_type(entry), offset), idx);
+		if (!ahead)
+			break;
+		/* only the read was wanted; drop our reference again */
+		page_cache_release(ahead);
+		offset++;
+	}
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+	return shmem_swapin_async(policy, entry, idx);
+}
+
+/*
+ * NUMA variant: allocate one zeroed page for file index idx, honouring
+ * the inode's shared memory policy by way of a temporary pseudo vma.
+ */
+static struct page *
+shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
+		 unsigned long idx)
+{
+	struct page *new_page;
+	struct vm_area_struct pvma;
+
+	memset(&pvma, 0, sizeof(pvma));
+	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_pgoff = idx;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+
+	new_page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
+
+	/* Release the policy obtained by the lookup above */
+	mpol_free(pvma.vm_policy);
+	return new_page;
+}
+#else
+/*
+ * Non-NUMA variant: trigger swap readahead for the entry, then read the
+ * entry itself.  info and idx are unused here.
+ */
+static inline struct page *
+shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
+{
+	struct page *page;
+
+	swapin_readahead(entry, 0, NULL);
+	page = read_swap_cache_async(entry, NULL, 0);
+	return page;
+}
+
+/*
+ * Non-NUMA variant: plain zeroed page allocation.  info and idx are
+ * unused here.
+ */
+static inline struct page *
+shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
+				 unsigned long idx)
+{
+	struct page *page = alloc_page(gfp | __GFP_ZERO);
+
+	return page;
+}
+#endif
+
+/*
+ * shmem_getpage - either get the page from swap or allocate a new one
+ *
+ * If we allocate a new one we do not mark it dirty. That's up to the
+ * vm. If we swap it in we mark it dirty since we also free the swap
+ * entry since a page cannot live in both the swap and page cache
+ *
+ * @inode: shmem inode owning the page
+ * @idx:   page index in the file; -EFBIG if >= SHMEM_MAX_INDEX
+ * @pagep: in/out.  Usually points at NULL on entry; on success it is
+ *         set to the page found or created here, which is unlocked
+ *         before returning.  If the caller passes in a page, that page
+ *         is used as the file page and is not unlocked here.
+ * @sgp:   SGP_QUICK gives up (returning 0, *pagep untouched) as soon as
+ *         no uptodate page is found in the page cache; SGP_READ does not
+ *         allocate a page when there is no swap entry, so *pagep may be
+ *         left NULL on a 0 return; other values are passed on to
+ *         shmem_swp_alloc().
+ * @type:  optional fault type; upgraded from VM_FAULT_MINOR to
+ *         VM_FAULT_MAJOR when swap I/O has to be started.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+			struct page **pagep, enum sgp_type sgp, int *type)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo;
+	struct page *filepage = *pagep;
+	struct page *swappage;
+	swp_entry_t *entry;
+	swp_entry_t swap;
+	int error;
+
+	if (idx >= SHMEM_MAX_INDEX)
+		return -EFBIG;
+	/*
+	 * Normally, filepage is NULL on entry, and either found
+	 * uptodate immediately, or allocated and zeroed, or read
+	 * in under swappage, which is then assigned to filepage.
+	 * But shmem_prepare_write passes in a locked filepage,
+	 * which may be found not uptodate by other callers too,
+	 * and may need to be copied from the swappage read in.
+	 */
+repeat:
+	if (!filepage)
+		filepage = find_lock_page(mapping, idx);
+	if (filepage && PageUptodate(filepage))
+		goto done;
+	error = 0;
+	/* SGP_QUICK: leave through "failed" with error == 0 */
+	if (sgp == SGP_QUICK)
+		goto failed;
+
+	spin_lock(&info->lock);
+	shmem_recalc_inode(inode);
+	entry = shmem_swp_alloc(info, idx, sgp);
+	if (IS_ERR(entry)) {
+		spin_unlock(&info->lock);
+		error = PTR_ERR(entry);
+		goto failed;
+	}
+	/* Snapshot the swap entry; it is re-checked after any lock drop */
+	swap = *entry;
+
+	if (swap.val) {
+		/* Look it up and read it in.. */
+		swappage = lookup_swap_cache(swap);
+		if (!swappage) {
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			/* here we actually do the io */
+			if (type && *type == VM_FAULT_MINOR) {
+				inc_page_state(pgmajfault);
+				*type = VM_FAULT_MAJOR;
+			}
+			swappage = shmem_swapin(info, swap, idx);
+			if (!swappage) {
+				/*
+				 * Swapin failed: it is only reported as
+				 * -ENOMEM if the entry still holds the same
+				 * swap value; otherwise just start over.
+				 */
+				spin_lock(&info->lock);
+				entry = shmem_swp_alloc(info, idx, sgp);
+				if (IS_ERR(entry))
+					error = PTR_ERR(entry);
+				else {
+					if (entry->val == swap.val)
+						error = -ENOMEM;
+					shmem_swp_unmap(entry);
+				}
+				spin_unlock(&info->lock);
+				if (error)
+					goto failed;
+				goto repeat;
+			}
+			wait_on_page_locked(swappage);
+			page_cache_release(swappage);
+			goto repeat;
+		}
+
+		/* We have to do this with page locked to prevent races */
+		if (TestSetPageLocked(swappage)) {
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			wait_on_page_locked(swappage);
+			page_cache_release(swappage);
+			goto repeat;
+		}
+		if (PageWriteback(swappage)) {
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			wait_on_page_writeback(swappage);
+			unlock_page(swappage);
+			page_cache_release(swappage);
+			goto repeat;
+		}
+		if (!PageUptodate(swappage)) {
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			unlock_page(swappage);
+			page_cache_release(swappage);
+			error = -EIO;
+			goto failed;
+		}
+
+		if (filepage) {
+			/*
+			 * A file page already exists: copy the swapped
+			 * data into it and discard the swap copy.
+			 */
+			shmem_swp_set(info, entry, 0);
+			shmem_swp_unmap(entry);
+			delete_from_swap_cache(swappage);
+			spin_unlock(&info->lock);
+			copy_highpage(filepage, swappage);
+			unlock_page(swappage);
+			page_cache_release(swappage);
+			flush_dcache_page(filepage);
+			SetPageUptodate(filepage);
+			set_page_dirty(filepage);
+			swap_free(swap);
+		} else if (!(error = move_from_swap_cache(
+				swappage, idx, mapping))) {
+			/* The swap page itself becomes the file page */
+			info->flags |= SHMEM_PAGEIN;
+			shmem_swp_set(info, entry, 0);
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			filepage = swappage;
+			swap_free(swap);
+		} else {
+			/* Could not move it: back off and retry from the top */
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			unlock_page(swappage);
+			page_cache_release(swappage);
+			if (error == -ENOMEM) {
+				/* let kswapd refresh zone for GFP_ATOMICs */
+				blk_congestion_wait(WRITE, HZ/50);
+			}
+			goto repeat;
+		}
+	} else if (sgp == SGP_READ && !filepage) {
+		/*
+		 * No swap entry and a read: do not allocate.  Take a page
+		 * only if one has appeared in the page cache meanwhile;
+		 * otherwise filepage stays NULL.
+		 */
+		shmem_swp_unmap(entry);
+		filepage = find_get_page(mapping, idx);
+		if (filepage &&
+		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
+			spin_unlock(&info->lock);
+			wait_on_page_locked(filepage);
+			page_cache_release(filepage);
+			filepage = NULL;
+			goto repeat;
+		}
+		spin_unlock(&info->lock);
+	} else {
+		/* No swap entry: account for one more block, then allocate */
+		shmem_swp_unmap(entry);
+		sbinfo = SHMEM_SB(inode->i_sb);
+		if (sbinfo) {
+			spin_lock(&sbinfo->stat_lock);
+			if (sbinfo->free_blocks == 0 ||
+			    shmem_acct_block(info->flags)) {
+				spin_unlock(&sbinfo->stat_lock);
+				spin_unlock(&info->lock);
+				error = -ENOSPC;
+				goto failed;
+			}
+			sbinfo->free_blocks--;
+			inode->i_blocks += BLOCKS_PER_PAGE;
+			spin_unlock(&sbinfo->stat_lock);
+		} else if (shmem_acct_block(info->flags)) {
+			spin_unlock(&info->lock);
+			error = -ENOSPC;
+			goto failed;
+		}
+
+		if (!filepage) {
+			/* info->lock is dropped across the page allocation */
+			spin_unlock(&info->lock);
+			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
+						    info,
+						    idx);
+			if (!filepage) {
+				shmem_unacct_blocks(info->flags, 1);
+				shmem_free_blocks(inode, 1);
+				error = -ENOMEM;
+				goto failed;
+			}
+
+			/*
+			 * Re-check under the lock: if a swap entry has
+			 * appeared, or the page cannot be inserted, undo
+			 * the accounting and retry.
+			 */
+			spin_lock(&info->lock);
+			entry = shmem_swp_alloc(info, idx, sgp);
+			if (IS_ERR(entry))
+				error = PTR_ERR(entry);
+			else {
+				swap = *entry;
+				shmem_swp_unmap(entry);
+			}
+			if (error || swap.val || 0 != add_to_page_cache_lru(
+					filepage, mapping, idx, GFP_ATOMIC)) {
+				spin_unlock(&info->lock);
+				page_cache_release(filepage);
+				shmem_unacct_blocks(info->flags, 1);
+				shmem_free_blocks(inode, 1);
+				filepage = NULL;
+				if (error)
+					goto failed;
+				goto repeat;
+			}
+			info->flags |= SHMEM_PAGEIN;
+		}
+
+		info->alloced++;
+		spin_unlock(&info->lock);
+		flush_dcache_page(filepage);
+		SetPageUptodate(filepage);
+	}
+done:
+	/* Only unlock a page we found ourselves, never the caller's own */
+	if (*pagep != filepage) {
+		unlock_page(filepage);
+		*pagep = filepage;
+	}
+	return 0;
+
+failed:
+	/* Drop a page we found ourselves; *pagep is left as passed in */
+	if (*pagep != filepage) {
+		unlock_page(filepage);
+		page_cache_release(filepage);
+	}
+	return error;
+}
+
+/*
+ * Fault handler: translate the faulting address into a file page index
+ * and fetch that page with SGP_CACHE.  Returns NOPAGE_SIGBUS when the
+ * index lies at or beyond i_size or on any error other than -ENOMEM,
+ * which is reported as NOPAGE_OOM.
+ */
+struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page = NULL;
+	unsigned long pgidx;
+	int error;
+
+	pgidx = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+
+	if (((loff_t) pgidx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+		return NOPAGE_SIGBUS;
+
+	error = shmem_getpage(inode, pgidx, &page, SGP_CACHE, type);
+	if (error == -ENOMEM)
+		return NOPAGE_OOM;
+	if (error)
+		return NOPAGE_SIGBUS;
+
+	mark_page_accessed(page);
+	return page;
+}
+
+/*
+ * Map file pages pgoff.. into [addr, addr + len) of vma, one page at a
+ * time.  With nonblock set, pages are looked up with SGP_QUICK and a
+ * file pte is installed for any page that lookup did not return.
+ * Returns -EINVAL if the range is not within the file, else 0 or the
+ * first error encountered.
+ */
+static int shmem_populate(struct vm_area_struct *vma,
+	unsigned long addr, unsigned long len,
+	pgprot_t prot, unsigned long pgoff, int nonblock)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct mm_struct *mm = vma->vm_mm;
+	enum sgp_type sgp;
+	unsigned long nr_pages;
+	int err;
+
+	sgp = nonblock ? SGP_QUICK : SGP_CACHE;
+
+	nr_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= nr_pages)
+		return -EINVAL;
+	if (pgoff + (len >> PAGE_SHIFT) > nr_pages)
+		return -EINVAL;
+
+	/*
+	 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
+	 */
+	for (; (long) len > 0; len -= PAGE_SIZE, addr += PAGE_SIZE, pgoff++) {
+		struct page *page = NULL;
+
+		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
+		if (err)
+			return err;
+
+		if (page) {
+			mark_page_accessed(page);
+			err = install_page(mm, vma, addr, page, prot);
+			if (err) {
+				page_cache_release(page);
+				return err;
+			}
+		} else if (nonblock) {
+			err = install_file_pte(mm, vma, addr, pgoff, prot);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+/* Install "new" as the shared policy of the file behind vma. */
+int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	return mpol_set_shared_policy(&info->policy, vma, new);
+}
+
+/*
+ * Look up the shared policy for the file page that backs address addr
+ * within vma.
+ */
+struct mempolicy *
+shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	unsigned long pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+
+	pgidx += vma->vm_pgoff;
+	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, pgidx);
+}
+#endif
+
+/*
+ * Set (lock != 0) or clear (lock == 0) VM_LOCKED on a shmem file,
+ * charging or uncharging i_size against "user".
+ *
+ * Returns -ENOMEM if user_shm_lock() refuses the charge, otherwise 0.
+ * Unlocking is skipped when user is NULL.
+ */
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	int retval = 0;
+
+	spin_lock(&info->lock);
+	if (lock) {
+		if (!(info->flags & VM_LOCKED)) {
+			if (user_shm_lock(inode->i_size, user))
+				info->flags |= VM_LOCKED;
+			else
+				retval = -ENOMEM;
+		}
+	} else if ((info->flags & VM_LOCKED) && user) {
+		user_shm_unlock(inode->i_size, user);
+		info->flags &= ~VM_LOCKED;
+	}
+	spin_unlock(&info->lock);
+	return retval;
+}
+
+/* mmap of a shmem file: hook up shmem_vm_ops and note the access. */
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &shmem_vm_ops;
+	file_accessed(file);
+	return 0;
+}
+
+/*
+ * Allocate and initialise a new shmem inode of the given mode.
+ *
+ * When the superblock has limits (sbinfo != NULL) one inode is taken
+ * from free_inodes first and given back if new_inode() fails.
+ * Returns the inode, or NULL if none is available.
+ */
+static struct inode *
+shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_inode_info *info;
+	struct inode *inode;
+
+	/* Reserve an inode from the superblock's budget, if it has one */
+	if (sbinfo) {
+		spin_lock(&sbinfo->stat_lock);
+		if (!sbinfo->free_inodes) {
+			spin_unlock(&sbinfo->stat_lock);
+			return NULL;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	inode = new_inode(sb);
+	if (!inode) {
+		/* Allocation failed: hand the reservation back */
+		if (sbinfo) {
+			spin_lock(&sbinfo->stat_lock);
+			sbinfo->free_inodes++;
+			spin_unlock(&sbinfo->stat_lock);
+		}
+		return NULL;
+	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->a_ops = &shmem_aops;
+	inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	/* Clear the shmem-private part that sits in front of vfs_inode */
+	info = SHMEM_I(inode);
+	memset(info, 0, (char *)inode - (char *)info);
+	spin_lock_init(&info->lock);
+	INIT_LIST_HEAD(&info->swaplist);
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &shmem_inode_operations;
+		inode->i_fop = &shmem_file_operations;
+		mpol_shared_policy_init(&info->policy);
+		break;
+	case S_IFDIR:
+		inode->i_nlink++;
+		/* Some things misbehave if size == 0 on a directory */
+		inode->i_size = 2 * BOGO_DIRENT_SIZE;
+		inode->i_op = &shmem_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		break;
+	case S_IFLNK:
+		/*
+		 * Must not load anything in the rbtree,
+		 * mpol_free_shared_policy will not be called.
+		 */
+		mpol_shared_policy_init(&info->policy);
+		break;
+	default:
+		inode->i_op = &shmem_special_inode_operations;
+		init_special_inode(inode, mode, dev);
+		break;
+	}
+	return inode;
+}
+
+#ifdef CONFIG_TMPFS
+
+/*
+ * Change the block and inode limits of a limited instance.
+ *
+ * Fails with -EINVAL, changing nothing, if either new limit is below
+ * what is currently in use (max - free); otherwise stores the new
+ * limits, recomputes the free counts and returns 0.
+ */
+static int shmem_set_size(struct shmem_sb_info *sbinfo,
+			  unsigned long max_blocks, unsigned long max_inodes)
+{
+	unsigned long used_blocks, used_inodes;
+	int error = -EINVAL;
+
+	spin_lock(&sbinfo->stat_lock);
+	used_blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+	used_inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+	if (max_blocks >= used_blocks && max_inodes >= used_inodes) {
+		sbinfo->max_blocks  = max_blocks;
+		sbinfo->free_blocks = max_blocks - used_blocks;
+		sbinfo->max_inodes  = max_inodes;
+		sbinfo->free_inodes = max_inodes - used_inodes;
+		error = 0;
+	}
+	spin_unlock(&sbinfo->stat_lock);
+	return error;
+}
+
+static struct inode_operations shmem_symlink_inode_operations;
+static struct inode_operations shmem_symlink_inline_operations;
+
+/*
+ * tmpfs itself does not normally go through shmem_prepare_write; it
+ * exists so that a tmpfs file can be used read-write underneath the
+ * loop driver.  file, offset and to are unused: the page handed in is
+ * simply passed to shmem_getpage() with SGP_WRITE.
+ */
+static int
+shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+
+	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+}
+
+/*
+ * write() for tmpfs files: copy count bytes from the user buffer into
+ * the file at *ppos, one page at a time, under inode->i_sem.
+ *
+ * Returns the number of bytes written if any were, otherwise 0 or a
+ * negative errno.  *ppos is advanced past the bytes written once the
+ * copy loop has been entered.
+ */
+static ssize_t
+shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct inode	*inode = file->f_dentry->d_inode;
+	loff_t		pos;
+	unsigned long	written;
+	ssize_t		err;
+
+	/* Reject counts that would be negative as a ssize_t */
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	down(&inode->i_sem);
+
+	pos = *ppos;
+	written = 0;
+
+	/* May adjust pos and count; nothing to do if count ends up 0 */
+	err = generic_write_checks(file, &pos, &count, 0);
+	if (err || !count)
+		goto out;
+
+	err = remove_suid(file->f_dentry);
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	do {
+		struct page *page = NULL;
+		unsigned long bytes, index, offset;
+		char *kaddr;
+		int left;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		/* Copy up to the end of this page, or less if count is smaller */
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		/*
+		 * We don't hold page lock across copy from user -
+		 * what would it guard against? - so no deadlock here.
+		 * But it still may be a good idea to prefault below.
+		 */
+
+		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
+		if (err)
+			break;
+
+		/* left != 0 below means the non-atomic copy is still to be done */
+		left = bytes;
+		if (PageHighMem(page)) {
+			/* Touch first and last source byte before the atomic copy */
+			volatile unsigned char dummy;
+			__get_user(dummy, buf);
+			__get_user(dummy, buf + bytes - 1);
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			left = __copy_from_user_inatomic(kaddr + offset,
+							buf, bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		if (left) {
+			/* Lowmem page, or the atomic copy fell short: redo it with kmap */
+			kaddr = kmap(page);
+			left = __copy_from_user(kaddr + offset, buf, bytes);
+			kunmap(page);
+		}
+
+		/* Advance as if everything was copied; corrected below if not */
+		written += bytes;
+		count -= bytes;
+		pos += bytes;
+		buf += bytes;
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+
+		flush_dcache_page(page);
+		set_page_dirty(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+
+		if (left) {
+			/* Short copy: take back the uncopied tail and stop */
+			pos -= left;
+			written -= left;
+			err = -EFAULT;
+			break;
+		}
+
+		/*
+		 * Our dirty pages are not counted in nr_dirty,
+		 * and we do not attempt to balance dirty pages.
+		 */
+
+		cond_resched();
+	} while (count);
+
+	*ppos = pos;
+	/* A partial write reports the byte count rather than the error */
+	if (written)
+		err = written;
+out:
+	up(&inode->i_sem);
+	return err;
+}
+
+/*
+ * Common read loop for read() and sendfile(): feed the file's pages,
+ * starting at *ppos, to "actor" until the actor consumes less than it
+ * was offered, desc->count reaches zero, end of file is reached, or
+ * shmem_getpage() fails.  Errors are reported through desc->error;
+ * *ppos is updated to the position reached.
+ */
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index, offset;
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+
+	for (;;) {
+		struct page *page = NULL;
+		unsigned long end_index, nr, ret;
+		loff_t i_size = i_size_read(inode);
+
+		/* First check against i_size, before looking up the page */
+		end_index = i_size >> PAGE_CACHE_SHIFT;
+		if (index > end_index)
+			break;
+		if (index == end_index) {
+			nr = i_size & ~PAGE_CACHE_MASK;
+			if (nr <= offset)
+				break;
+		}
+
+		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
+		if (desc->error) {
+			/* -EINVAL just ends the read and is not reported */
+			if (desc->error == -EINVAL)
+				desc->error = 0;
+			break;
+		}
+
+		/*
+		 * We must evaluate after, since reads (unlike writes)
+		 * are called without i_sem protection against truncate
+		 */
+		nr = PAGE_CACHE_SIZE;
+		i_size = i_size_read(inode);
+		end_index = i_size >> PAGE_CACHE_SHIFT;
+		if (index == end_index) {
+			nr = i_size & ~PAGE_CACHE_MASK;
+			if (nr <= offset) {
+				if (page)
+					page_cache_release(page);
+				break;
+			}
+		}
+		/* nr is now the number of bytes available from offset */
+		nr -= offset;
+
+		if (page) {
+			/*
+			 * If users can be writing to this page using arbitrary
+			 * virtual addresses, take care about potential aliasing
+			 * before reading the page on the kernel side.
+			 */
+			if (mapping_writably_mapped(mapping))
+				flush_dcache_page(page);
+			/*
+			 * Mark the page accessed if we read the beginning.
+			 */
+			if (!offset)
+				mark_page_accessed(page);
+		} else
+			/* No page came back from shmem_getpage: use the zero page */
+			page = ZERO_PAGE(0);
+
+		/*
+		 * Ok, we have the page, and it's up-to-date, so
+		 * now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		ret = actor(desc, page, offset, nr);
+		offset += ret;
+		index += offset >> PAGE_CACHE_SHIFT;
+		offset &= ~PAGE_CACHE_MASK;
+
+		page_cache_release(page);
+		if (ret != nr || !desc->count)
+			break;
+
+		cond_resched();
+	}
+
+	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	file_accessed(filp);
+}
+
+/*
+ * read() for tmpfs files.  Validates count and the user buffer, then
+ * lets do_shmem_file_read() copy via file_read_actor.  Returns the
+ * bytes read if any were, else the error recorded in the descriptor.
+ */
+static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+{
+	read_descriptor_t desc;
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+	if (count == 0)
+		return 0;
+
+	desc.arg.buf = buf;
+	desc.count = count;
+	desc.written = 0;
+	desc.error = 0;
+
+	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
+
+	if (!desc.written)
+		return desc.error;
+	return desc.written;
+}
+
+/*
+ * sendfile() for tmpfs files: same read loop as read(), but with the
+ * caller's actor and opaque target.  Returns the bytes transferred if
+ * any were, else the error recorded in the descriptor.
+ */
+static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
+			 size_t count, read_actor_t actor, void *target)
+{
+	read_descriptor_t desc;
+
+	if (count == 0)
+		return 0;
+
+	desc.arg.data = target;
+	desc.count = count;
+	desc.written = 0;
+	desc.error = 0;
+
+	do_shmem_file_read(in_file, ppos, &desc, actor);
+
+	if (!desc.written)
+		return desc.error;
+	return desc.written;
+}
+
+/*
+ * statfs: always reports type, block size and name length; block and
+ * inode counts are only filled in for instances that have limits.
+ */
+static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+	buf->f_type = TMPFS_MAGIC;
+	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_namelen = NAME_MAX;
+
+	/* no limits: leave the remaining fields 0 like simple_statfs */
+	if (!sbinfo)
+		return 0;
+
+	spin_lock(&sbinfo->stat_lock);
+	buf->f_blocks = sbinfo->max_blocks;
+	buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+	buf->f_files = sbinfo->max_inodes;
+	buf->f_ffree = sbinfo->free_inodes;
+	spin_unlock(&sbinfo->stat_lock);
+	return 0;
+}
+
+/*
+ * Create a node in a directory: get a fresh inode, inherit the group
+ * from a setgid parent, and attach it to the dentry.  Returns -ENOSPC
+ * if no inode could be obtained.
+ */
+static int
+shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	struct inode *inode;
+
+	inode = shmem_get_inode(dir->i_sb, mode, dev);
+	if (!inode)
+		return -ENOSPC;
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		/* new directories stay setgid as well */
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+	dir->i_size += BOGO_DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	d_instantiate(dentry, inode);
+	dget(dentry); /* Extra count - pin the dentry in core */
+	return 0;
+}
+
+/* mkdir: create the directory node, then bump the parent's link count. */
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+	if (!error)
+		dir->i_nlink++;
+	return error;
+}
+
+/* create: a regular file is just a mknod with S_IFREG; nd is unused. */
+static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	int error = shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+
+	return error;
+}
+
+/*
+ * Create a hard link to an existing inode.  On instances with limits
+ * each link is charged as one inode; returns -ENOSPC when none is free.
+ */
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+	/*
+	 * No ordinary (disk based) filesystem counts links as inodes;
+	 * but each new link needs a new dentry, pinning lowmem, and
+	 * tmpfs dentries cannot be pruned until they are unlinked.
+	 */
+	if (sbinfo) {
+		int have_inode;
+
+		spin_lock(&sbinfo->stat_lock);
+		have_inode = sbinfo->free_inodes != 0;
+		if (have_inode)
+			sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+		if (!have_inode)
+			return -ENOSPC;
+	}
+
+	dir->i_size += BOGO_DIRENT_SIZE;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink++;
+	atomic_inc(&inode->i_count);	/* New dentry reference */
+	dget(dentry);		/* Extra pinning count for the created dentry */
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+/*
+ * Remove a directory entry.  When this is an extra hard link to a
+ * non-directory (i_nlink > 1), the inode charged for it at link time
+ * is credited back to the instance's free_inodes.
+ */
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+	if (sbinfo && inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	dir->i_size -= BOGO_DIRENT_SIZE;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inode->i_nlink--;
+	dput(dentry);	/* Undo the count from "create" - this does all the work */
+	return 0;
+}
+
+/* rmdir: only empty directories may go; the rest is shmem_unlink(). */
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int empty = simple_empty(dentry);
+
+	if (!empty)
+		return -ENOTEMPTY;
+
+	dir->i_nlink--;
+	return shmem_unlink(dir, dentry);
+}
+
+/*
+ * The VFS layer already does all the dentry stuff for rename,
+ * we just have to decrement the usage count for the target if
+ * it exists so that the VFS layer correctly frees it when it
+ * gets overwritten.
+ */
+static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int moving_dir = S_ISDIR(inode->i_mode);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	/*
+	 * An existing target is unlinked; otherwise a moved directory
+	 * adds a link to its new parent.  Either way a moved directory
+	 * takes one link away from its old parent.
+	 */
+	if (new_dentry->d_inode)
+		(void) shmem_unlink(new_dir, new_dentry);
+	else if (moving_dir)
+		new_dir->i_nlink++;
+	if (moving_dir)
+		old_dir->i_nlink--;
+
+	old_dir->i_size -= BOGO_DIRENT_SIZE;
+	new_dir->i_size += BOGO_DIRENT_SIZE;
+	old_dir->i_ctime = old_dir->i_mtime =
+		new_dir->i_ctime = new_dir->i_mtime =
+		inode->i_ctime = CURRENT_TIME;
+	return 0;
+}
+
+/*
+ * Create a symlink.  A target that fits (with its NUL) into the space
+ * in front of vfs_inode is stored inline in the shmem_inode_info;
+ * longer targets are written to page 0 of the inode.
+ *
+ * Returns -ENAMETOOLONG if the target plus NUL exceeds a page, -ENOSPC
+ * if no inode is available, or the shmem_getpage() error.
+ */
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct shmem_inode_info *info;
+	struct inode *inode;
+	int len = strlen(symname) + 1;	/* counts the trailing NUL */
+
+	if (len > PAGE_CACHE_SIZE)
+		return -ENAMETOOLONG;
+
+	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+	if (!inode)
+		return -ENOSPC;
+
+	info = SHMEM_I(inode);
+	inode->i_size = len-1;
+	if (len > (char *)inode - (char *)info) {
+		struct page *page = NULL;
+		char *kaddr;
+		int error;
+
+		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+		if (error) {
+			iput(inode);
+			return error;
+		}
+		inode->i_op = &shmem_symlink_inode_operations;
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(kaddr, symname, len);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+		page_cache_release(page);
+	} else {
+		/* do it inline */
+		memcpy(info, symname, len);
+		inode->i_op = &shmem_symlink_inline_operations;
+	}
+
+	if (dir->i_mode & S_ISGID)
+		inode->i_gid = dir->i_gid;
+	dir->i_size += BOGO_DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+}
+
+/*
+ * follow_link for inline symlinks: the target string sits at the very
+ * start of the shmem_inode_info (shmem_symlink() copies it there).
+ */
+static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+{
+	char *target = (char *)SHMEM_I(dentry->d_inode);
+
+	nd_set_link(nd, target);
+	return 0;
+}
+
+/*
+ * follow_link for page-backed symlinks: fetch page 0 and hand out its
+ * kmap()ed address, or an ERR_PTR if the page could not be obtained.
+ * The page reference and kmap are undone in shmem_put_link().
+ */
+static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page = NULL;
+	int res;
+
+	res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+	if (res)
+		nd_set_link(nd, ERR_PTR(res));
+	else
+		nd_set_link(nd, kmap(page));
+	return 0;
+}
+
+/*
+ * Undo shmem_follow_link(): look page 0 up again, kunmap it and drop
+ * two references - the one taken by this lookup and the one that
+ * shmem_follow_link() left held.  Nothing to do if the link was an error.
+ */
+static void shmem_put_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page;
+
+	if (IS_ERR(nd_get_link(nd)))
+		return;
+
+	page = find_get_page(dentry->d_inode->i_mapping, 0);
+	BUG_ON(!page);
+	kunmap(page);
+	mark_page_accessed(page);
+	page_cache_release(page);
+	page_cache_release(page);
+}
+
+/*
+ * inode operations for symlinks whose target is stored inline in the
+ * shmem_inode_info (installed by shmem_symlink()).  No put_link is set:
+ * shmem_follow_link_inline() takes no page reference to undo.
+ */
+static struct inode_operations shmem_symlink_inline_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= shmem_follow_link_inline,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr       = generic_setxattr,
+	.getxattr       = generic_getxattr,
+	.listxattr      = generic_listxattr,
+	.removexattr    = generic_removexattr,
+#endif
+};
+
+/*
+ * inode operations for symlinks whose target lives in page 0 of the
+ * inode (installed by shmem_symlink() for targets too long to inline).
+ * follow_link and put_link are paired: the former maps the page, the
+ * latter unmaps and releases it.
+ */
+static struct inode_operations shmem_symlink_inode_operations = {
+	.truncate	= shmem_truncate,
+	.readlink	= generic_readlink,
+	.follow_link	= shmem_follow_link,
+	.put_link	= shmem_put_link,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr       = generic_setxattr,
+	.getxattr       = generic_getxattr,
+	.listxattr      = generic_listxattr,
+	.removexattr    = generic_removexattr,
+#endif
+};
+
+/*
+ * Parse a comma-separated "name=value" mount option string.
+ *
+ * The string is modified in place (strsep, and '=' overwritten by NUL).
+ * Recognised names: size, nr_blocks, nr_inodes, mode, uid, gid.
+ * mode, uid and gid may be passed as NULL, in which case those options
+ * are accepted but ignored (shmem_remount_fs does this).  blocks and
+ * inodes are only written when the matching option is present.
+ *
+ * Returns 0 on success, 1 after logging an error for an option without
+ * a value, an unknown option, or a value with trailing garbage.
+ */
+static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
+{
+	char *this_char, *value, *rest;
+
+	while ((this_char = strsep(&options, ",")) != NULL) {
+		/* Skip empty items such as those produced by ",," */
+		if (!*this_char)
+			continue;
+		/* Split "name=value" in place; value points past the '=' */
+		if ((value = strchr(this_char,'=')) != NULL) {
+			*value++ = 0;
+		} else {
+			printk(KERN_ERR
+			    "tmpfs: No value for mount option '%s'\n",
+			    this_char);
+			return 1;
+		}
+
+		if (!strcmp(this_char,"size")) {
+			unsigned long long size;
+			size = memparse(value,&rest);
+			/* "N%" means N percent of total RAM, computed in bytes */
+			if (*rest == '%') {
+				size <<= PAGE_SHIFT;
+				size *= totalram_pages;
+				do_div(size, 100);
+				rest++;
+			}
+			if (*rest)
+				goto bad_val;
+			/* Bytes to pages, rounding down */
+			*blocks = size >> PAGE_CACHE_SHIFT;
+		} else if (!strcmp(this_char,"nr_blocks")) {
+			*blocks = memparse(value,&rest);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"nr_inodes")) {
+			*inodes = memparse(value,&rest);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"mode")) {
+			if (!mode)
+				continue;
+			/* mode is always read as octal */
+			*mode = simple_strtoul(value,&rest,8);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"uid")) {
+			if (!uid)
+				continue;
+			*uid = simple_strtoul(value,&rest,0);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"gid")) {
+			if (!gid)
+				continue;
+			*gid = simple_strtoul(value,&rest,0);
+			if (*rest)
+				goto bad_val;
+		} else {
+			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
+			       this_char);
+			return 1;
+		}
+	}
+	return 0;
+
+bad_val:
+	/* Shared exit for any value that did not parse to its end */
+	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+	       value, this_char);
+	return 1;
+
+}
+
+/*
+ * Remount: only the block and inode limits can change (mode, uid and
+ * gid are parsed but ignored).  Switching between a limited and an
+ * unlimited instance is refused with -EINVAL.
+ */
+static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	unsigned long max_blocks = sbinfo ? sbinfo->max_blocks : 0;
+	unsigned long max_inodes = sbinfo ? sbinfo->max_inodes : 0;
+	int wants_limits;
+
+	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
+		return -EINVAL;
+
+	/* Keep it simple: disallow limited <-> unlimited remount */
+	wants_limits = max_blocks || max_inodes;
+	if (wants_limits == !sbinfo)
+		return -EINVAL;
+
+	/* But allow the pointless unlimited -> unlimited remount */
+	if (!sbinfo)
+		return 0;
+	return shmem_set_size(sbinfo, max_blocks, max_inodes);
+}
+#endif
+
+/* Free the per-superblock limits, if any, and clear the pointer. */
+static void shmem_put_super(struct super_block *sb)
+{
+	void *sbinfo = sb->s_fs_info;
+
+	sb->s_fs_info = NULL;
+	kfree(sbinfo);
+}
+
+#ifdef CONFIG_TMPFS_XATTR
+static struct xattr_handler *shmem_xattr_handlers[];
+#else
+#define shmem_xattr_handlers NULL
+#endif
+
+/*
+ * Set up a new shmem/tmpfs superblock: parse mount options (CONFIG_TMPFS
+ * only), allocate the limits structure when limits apply, fill in the
+ * superblock fields and create the root directory.  silent is unused.
+ *
+ * Returns 0, -EINVAL for bad mount options, or -ENOMEM.
+ */
+static int shmem_fill_super(struct super_block *sb,
+			    void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+	int mode   = S_IRWXUGO | S_ISVTX;
+	uid_t uid = current->fsuid;
+	gid_t gid = current->fsgid;
+	int err = -ENOMEM;
+
+#ifdef CONFIG_TMPFS
+	unsigned long blocks = 0;
+	unsigned long inodes = 0;
+
+	/*
+	 * Per default we only allow half of the physical ram per
+	 * tmpfs instance, limiting inodes to one per page of lowmem;
+	 * but the internal instance is left unlimited.
+	 */
+	if (!(sb->s_flags & MS_NOUSER)) {
+		blocks = totalram_pages / 2;
+		inodes = totalram_pages - totalhigh_pages;
+		if (inodes > blocks)
+			inodes = blocks;
+
+		if (shmem_parse_options(data, &mode,
+					&uid, &gid, &blocks, &inodes))
+			return -EINVAL;
+	}
+
+	/* A limits structure is only allocated when some limit is non-zero */
+	if (blocks || inodes) {
+		struct shmem_sb_info *sbinfo;
+		sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
+		if (!sbinfo)
+			return -ENOMEM;
+		sb->s_fs_info = sbinfo;
+		spin_lock_init(&sbinfo->stat_lock);
+		sbinfo->max_blocks = blocks;
+		sbinfo->free_blocks = blocks;
+		sbinfo->max_inodes = inodes;
+		sbinfo->free_inodes = inodes;
+	}
+	sb->s_xattr = shmem_xattr_handlers;
+#else
+	sb->s_flags |= MS_NOUSER;
+#endif
+
+	sb->s_maxbytes = SHMEM_MAX_BYTES;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = TMPFS_MAGIC;
+	sb->s_op = &shmem_ops;
+	/* Root directory, owned by the uid/gid chosen above */
+	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
+	if (!inode)
+		goto failed;
+	inode->i_uid = uid;
+	inode->i_gid = gid;
+	root = d_alloc_root(inode);
+	if (!root)
+		goto failed_iput;
+	sb->s_root = root;
+	return 0;
+
+failed_iput:
+	iput(inode);
+failed:
+	/* Frees sb->s_fs_info and clears the pointer */
+	shmem_put_super(sb);
+	return err;
+}
+
+static kmem_cache_t *shmem_inode_cachep;
+
+/*
+ * alloc_inode: take a shmem_inode_info from the slab cache and hand
+ * back the struct inode embedded in it, or NULL on allocation failure.
+ */
+static struct inode *shmem_alloc_inode(struct super_block *sb)
+{
+	struct shmem_inode_info *info;
+
+	info = kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+	return info ? &info->vfs_inode : NULL;
+}
+
+/*
+ * destroy_inode: return the containing shmem_inode_info to the slab
+ * cache, releasing the shared policy first for regular files.
+ */
+static void shmem_destroy_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	/* only struct inode is valid if it's an inline symlink */
+	if (S_ISREG(inode->i_mode))
+		mpol_free_shared_policy(&info->policy);
+
+	kmem_cache_free(shmem_inode_cachep, info);
+}
+
+/*
+ * Slab constructor: runs inode_init_once() on the embedded inode, but
+ * only when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is clear.
+ */
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct shmem_inode_info *info = (struct shmem_inode_info *) foo;
+	unsigned long ctor_bits;
+
+	ctor_bits = flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
+	if (ctor_bits != SLAB_CTOR_CONSTRUCTOR)
+		return;
+
+	inode_init_once(&info->vfs_inode);
+}
+
+/* Create the slab cache for shmem inodes; -ENOMEM if that fails. */
+static int init_inodecache(void)
+{
+	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
+				sizeof(struct shmem_inode_info),
+				0, 0, init_once, NULL);
+
+	return shmem_inode_cachep ? 0 : -ENOMEM;
+}
+
+/* Tear down the inode slab cache, logging if the destroy call fails. */
+static void destroy_inodecache(void)
+{
+	if (kmem_cache_destroy(shmem_inode_cachep) != 0)
+		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
+}
+
+static struct address_space_operations shmem_aops = {	/* address-space ops for shmem */
+	.writepage	= shmem_writepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+#ifdef CONFIG_TMPFS
+	.prepare_write	= shmem_prepare_write,	/* write hooks are built only with CONFIG_TMPFS */
+	.commit_write	= simple_commit_write,
+#endif
+};
+
+static struct file_operations shmem_file_operations = {	/* installed as f_op by shmem_file_setup() */
+	.mmap		= shmem_mmap,	/* the only method present without CONFIG_TMPFS */
+#ifdef CONFIG_TMPFS
+	.llseek		= generic_file_llseek,
+	.read		= shmem_file_read,
+	.write		= shmem_file_write,
+	.fsync		= simple_sync_file,
+	.sendfile	= shmem_file_sendfile,
+#endif
+};
+
+static struct inode_operations shmem_inode_operations = {	/* inode ops: truncate and setattr, plus optional xattrs */
+	.truncate	= shmem_truncate,
+	.setattr	= shmem_notify_change,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr       = generic_setxattr,	/* xattr calls go through the generic_*xattr helpers */
+	.getxattr       = generic_getxattr,
+	.listxattr      = generic_listxattr,
+	.removexattr    = generic_removexattr,
+#endif
+};
+
+static struct inode_operations shmem_dir_inode_operations = {	/* directory inode ops; empty unless CONFIG_TMPFS */
+#ifdef CONFIG_TMPFS
+	.create		= shmem_create,
+	.lookup		= simple_lookup,
+	.link		= shmem_link,
+	.unlink		= shmem_unlink,
+	.symlink	= shmem_symlink,
+	.mkdir		= shmem_mkdir,
+	.rmdir		= shmem_rmdir,
+	.mknod		= shmem_mknod,
+	.rename		= shmem_rename,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr       = generic_setxattr,	/* needs both CONFIG_TMPFS and CONFIG_TMPFS_XATTR */
+	.getxattr       = generic_getxattr,
+	.listxattr      = generic_listxattr,
+	.removexattr    = generic_removexattr,
+#endif
+#endif
+};
+
+static struct inode_operations shmem_special_inode_operations = {	/* empty unless CONFIG_TMPFS_XATTR */
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+static struct super_operations shmem_ops = {	/* super_block ops, installed as sb->s_op */
+	.alloc_inode	= shmem_alloc_inode,	/* inodes come from shmem_inode_cachep */
+	.destroy_inode	= shmem_destroy_inode,
+#ifdef CONFIG_TMPFS
+	.statfs		= shmem_statfs,
+	.remount_fs	= shmem_remount_fs,
+#endif
+	.delete_inode	= shmem_delete_inode,
+	.drop_inode	= generic_delete_inode,
+	.put_super	= shmem_put_super,
+};
+
+static struct vm_operations_struct shmem_vm_ops = {	/* installed as vma->vm_ops by shmem_zero_setup() */
+	.nopage		= shmem_nopage,
+	.populate	= shmem_populate,
+#ifdef CONFIG_NUMA
+	.set_policy     = shmem_set_policy,	/* policy hooks exist only on CONFIG_NUMA builds */
+	.get_policy     = shmem_get_policy,
+#endif
+};
+
+
+#ifdef CONFIG_TMPFS_SECURITY
+
+static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len,
+					const char *name, size_t name_len)
+{
+	return security_inode_listsecurity(inode, list, list_len);	/* name and name_len are not consulted */
+}
+
+/*
+ * "get" handler for security xattrs: reject an empty attribute name with
+ * -EINVAL, otherwise return whatever security_inode_getsecurity() returns.
+ */
+static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (*name == '\0')
+		return -EINVAL;
+
+	return security_inode_getsecurity(inode, name, buffer, size);
+}
+
+/*
+ * "set" handler for security xattrs: reject an empty attribute name with
+ * -EINVAL, otherwise return whatever security_inode_setsecurity() returns.
+ */
+static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags)
+{
+	if (*name == '\0')
+		return -EINVAL;
+
+	return security_inode_setsecurity(inode, name, value, size, flags);
+}
+
+static struct xattr_handler shmem_xattr_security_handler = {	/* handler for names under XATTR_SECURITY_PREFIX */
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= shmem_xattr_security_list,
+	.get	= shmem_xattr_security_get,
+	.set	= shmem_xattr_security_set,
+};
+
+#endif	/* CONFIG_TMPFS_SECURITY */
+
+#ifdef CONFIG_TMPFS_XATTR
+
+static struct xattr_handler *shmem_xattr_handlers[] = {	/* NULL-terminated table assigned to sb->s_xattr */
+#ifdef CONFIG_TMPFS_SECURITY
+	&shmem_xattr_security_handler,
+#endif
+	NULL
+};
+
+#endif	/* CONFIG_TMPFS_XATTR */
+
+static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);	/* dev_name is not used */
+}
+
+static struct file_system_type tmpfs_fs_type = {	/* registered by init_tmpfs() */
+	.owner		= THIS_MODULE,
+	.name		= "tmpfs",
+	.get_sb		= shmem_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+static struct vfsmount *shm_mnt;	/* mount made by init_tmpfs(), or ERR_PTR() if that failed */
+
+/*
+ * Initialise tmpfs: create the inode cache, register the filesystem type
+ * and mount the instance kept in shm_mnt.  On failure the completed steps
+ * are undone in reverse order and shm_mnt is left holding ERR_PTR(error).
+ */
+static int __init init_tmpfs(void)
+{
+	int error = init_inodecache();
+
+	if (error)
+		goto fail;
+
+	error = register_filesystem(&tmpfs_fs_type);
+	if (error) {
+		printk(KERN_ERR "Could not register tmpfs\n");
+		goto undo_cache;
+	}
+#ifdef CONFIG_TMPFS
+	devfs_mk_dir("shm");
+#endif
+	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
+				tmpfs_fs_type.name, NULL);
+	if (!IS_ERR(shm_mnt))
+		return 0;
+
+	/* The mount failed: remember why, then unwind. */
+	error = PTR_ERR(shm_mnt);
+	printk(KERN_ERR "Could not kern_mount tmpfs\n");
+	unregister_filesystem(&tmpfs_fs_type);
+undo_cache:
+	destroy_inodecache();
+fail:
+	shm_mnt = ERR_PTR(error);
+	return error;
+}
+module_init(init_tmpfs)
+
+/*
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ *
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: passed to shmem_acct_size(); only its VM_ACCOUNT bit is stored in the inode
+ */
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+{
+	int error;
+	struct file *file;
+	struct inode *inode;
+	struct dentry *dentry, *root;
+	struct qstr this;
+
+	if (IS_ERR(shm_mnt))	/* init_tmpfs() failed; hand its error back */
+		return (void *)shm_mnt;
+
+	if (size < 0 || size > SHMEM_MAX_BYTES)
+		return ERR_PTR(-EINVAL);
+
+	if (shmem_acct_size(flags, size))	/* undone by shmem_unacct_size() on the error paths */
+		return ERR_PTR(-ENOMEM);
+
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0; /* will go */
+	root = shm_mnt->mnt_root;
+	dentry = d_alloc(root, &this);
+	if (!dentry)
+		goto put_memory;
+
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto put_dentry;
+
+	error = -ENOSPC;
+	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
+		goto close_file;
+
+	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;	/* It is unlinked */
+	file->f_vfsmnt = mntget(shm_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_op = &shmem_file_operations;
+	file->f_mode = FMODE_WRITE | FMODE_READ;
+	return file;
+
+close_file:	/* error unwinding: each label undoes one earlier step */
+	put_filp(file);
+put_dentry:
+	dput(dentry);
+put_memory:
+	shmem_unacct_size(flags, size);
+	return ERR_PTR(error);
+}
+
+/*
+ * shmem_zero_setup - setup a shared anonymous mapping
+ *
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ *
+ * Creates a "dev/zero" shmem file covering the vma and installs it as the
+ * vma's file, dropping the reference on any file the vma held before.
+ * Returns 0, or the error from shmem_file_setup().
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	loff_t len = vma->vm_end - vma->vm_start;
+	struct file *shm_file;
+
+	shm_file = shmem_file_setup("dev/zero", len, vma->vm_flags);
+	if (IS_ERR(shm_file))
+		return PTR_ERR(shm_file);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+
+	vma->vm_ops = &shmem_vm_ops;
+	vma->vm_file = shm_file;
+	return 0;
+}
diff --git a/mm/slab.c b/mm/slab.c
new file mode 100644
index 000000000000..ec660d85ddd7
--- /dev/null
+++ b/mm/slab.c
@@ -0,0 +1,3060 @@
+/*
+ * linux/mm/slab.c
+ * Written by Mark Hemment, 1996/97.
+ * (markhe@nextd.demon.co.uk)
+ *
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
+ * Major cleanup, different bufctl logic, per-cpu arrays
+ *	(c) 2000 Manfred Spraul
+ *
+ * Cleanup, make the head arrays unconditional, preparation for NUMA
+ * 	(c) 2002 Manfred Spraul
+ *
+ * An implementation of the Slab Allocator as described in outline in;
+ *	UNIX Internals: The New Frontiers by Uresh Vahalia
+ *	Pub: Prentice Hall	ISBN 0-13-101908-2
+ * or with a little more detail in;
+ *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
+ *	Jeff Bonwick (Sun Microsystems).
+ *	Presented at: USENIX Summer 1994 Technical Conference
+ *
+ * The memory is organized in caches, one cache for each object type.
+ * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
+ * Each cache consists of many slabs (they are small (usually one
+ * page long) and always contiguous), and each slab contains multiple
+ * initialized objects.
+ *
+ * This means that your constructor is used only for newly allocated
+ * slabs and you must pass objects with the same initializations to
+ * kmem_cache_free.
+ *
+ * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
+ * normal). If you need a special memory type, then you must create a new
+ * cache for that memory type.
+ *
+ * In order to reduce fragmentation, the slabs are sorted in 3 groups:
+ *   full slabs with 0 free objects
+ *   partial slabs
+ *   empty slabs with no allocated objects
+ *
+ * If partial slabs exist, then new allocations come from these slabs,
+ * otherwise from empty slabs or new slabs are allocated.
+ *
+ * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
+ * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
+ *
+ * Each cache has a short per-cpu head array, most allocs
+ * and frees go into that array, and if that array overflows, then 1/2
+ * of the entries in the array are given back into the global cache.
+ * The head array is strictly LIFO and should improve the cache hit rates.
+ * On SMP, it additionally reduces the spinlock operations.
+ *
+ * The c_cpuarray may not be read with enabled local interrupts - 
+ * it's changed with a smp_call_function().
+ *
+ * SMP synchronization:
+ *  constructors and destructors are called without any locking.
+ *  Several members in kmem_cache_t and struct slab never change, they
+ *	are accessed without any locking.
+ *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
+ *  	and local interrupts are disabled so slab code is preempt-safe.
+ *  The non-constant members are protected with a per-cache irq spinlock.
+ *
+ * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
+ * in 2000 - many ideas in the current implementation are derived from
+ * his patch.
+ *
+ * Further notes from the original documentation:
+ *
+ * 11 April '97.  Started multi-threading - markhe
+ *	The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ *	The sem is only needed when accessing/extending the cache-chain, which
+ *	can never happen inside an interrupt (kmem_cache_create(),
+ *	kmem_cache_shrink() and kmem_cache_reap()).
+ *
+ *	At present, each engine can be growing a cache.  This should be blocked.
+ *
+ */
+
+#include	<linux/config.h>
+#include	<linux/slab.h>
+#include	<linux/mm.h>
+#include	<linux/swap.h>
+#include	<linux/cache.h>
+#include	<linux/interrupt.h>
+#include	<linux/init.h>
+#include	<linux/compiler.h>
+#include	<linux/seq_file.h>
+#include	<linux/notifier.h>
+#include	<linux/kallsyms.h>
+#include	<linux/cpu.h>
+#include	<linux/sysctl.h>
+#include	<linux/module.h>
+#include	<linux/rcupdate.h>
+
+#include	<asm/uaccess.h>
+#include	<asm/cacheflush.h>
+#include	<asm/tlbflush.h>
+#include	<asm/page.h>
+
+/*
+ * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
+ *		  SLAB_RED_ZONE & SLAB_POISON.
+ *		  0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS	- 1 to collect stats for /proc/slabinfo.
+ *		  0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define	DEBUG		1
+#define	STATS		1
+#define	FORCED_DEBUG	1
+#else
+#define	DEBUG		0
+#define	STATS		0
+#define	FORCED_DEBUG	0
+#endif
+
+
+/* Shouldn't this be in a header file somewhere? */
+#define	BYTES_PER_WORD		sizeof(void *)
+
+#ifndef cache_line_size
+#define cache_line_size()	L1_CACHE_BYTES
+#endif
+
+#ifndef ARCH_KMALLOC_MINALIGN
+/*
+ * Enforce a minimum alignment for the kmalloc caches.
+ * Usually, the kmalloc caches are cache_line_size() aligned, except when
+ * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
+ * Some archs want to perform DMA into kmalloc caches and need a guaranteed
+ * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
+ * Note that this flag disables some debug features.
+ */
+#define ARCH_KMALLOC_MINALIGN 0
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+/*
+ * Enforce a minimum alignment for all caches.
+ * Intended for archs that get misalignment faults even for BYTES_PER_WORD
+ * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
+ * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
+ * some debug features.
+ */
+#define ARCH_SLAB_MINALIGN 0
+#endif
+
+#ifndef ARCH_KMALLOC_FLAGS
+#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
+#endif
+
+/* Legal flag mask for kmem_cache_create(). */
+#if DEBUG
+# define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
+			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
+			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
+#else
+# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
+			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+			 SLAB_DESTROY_BY_RCU)
+#endif
+
+/*
+ * kmem_bufctl_t:
+ *
+ * Bufctl's are used for linking objs within a slab
+ * linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, this is 56.
+ * This is not serious, as it is only for large objects, when it is unwise
+ * to have too many per slab.
+ * Note: This limit can be raised by introducing a general cache whose size
+ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+ */
+
+#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
+#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
+#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
+
+/* Max number of objs-per-slab for caches which use off-slab slabs.
+ * Needed to avoid a possible looping condition in cache_grow().
+ */
+static unsigned long offslab_limit;
+
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into three lists: fully used, partial, fully free slabs.
+ */
+struct slab {
+	struct list_head	list;		/* link in one of the three slab lists */
+	unsigned long		colouroff;
+	void			*s_mem;		/* including colour offset */
+	unsigned int		inuse;		/* num of objs active in slab */
+	kmem_bufctl_t		free;		/* presumably the first free obj's index - TODO confirm */
+};
+
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU.  This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking.  We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+	struct rcu_head		head;
+	kmem_cache_t		*cachep;
+	void			*addr;
+};
+
+/*
+ * struct array_cache
+ *
+ * Per cpu structures
+ * Purpose:
+ * - LIFO ordering, to hand out cache-warm objects from _alloc
+ * - reduce the number of linked list operations
+ * - reduce spinlock operations
+ *
+ * The limit is stored in the per-cpu structure to reduce the data cache
+ * footprint.
+ *
+ */
+struct array_cache {
+	unsigned int avail;		/* obj pointers currently held; 0 when freshly allocated */
+	unsigned int limit;		/* capacity of the pointer array that follows this header */
+	unsigned int batchcount;
+	unsigned int touched;
+};
+
+/* bootstrap: The caches do not work without cpuarrays anymore,
+ * but the cpuarrays are allocated from the generic caches...
+ */
+#define BOOT_CPUCACHE_ENTRIES	1
+struct arraycache_init {
+	struct array_cache cache;
+	void * entries[BOOT_CPUCACHE_ENTRIES];
+};
+
+/*
+ * The slab lists of all objects.
+ * Hopefully reduce the internal fragmentation
+ * NUMA: The spinlock could be moved from the kmem_cache_t
+ * into this structure, too. Figure out what causes
+ * fewer cross-node spinlock operations.
+ */
+struct kmem_list3 {
+	struct list_head	slabs_partial;	/* partial list first, better asm code */
+	struct list_head	slabs_full;
+	struct list_head	slabs_free;
+	unsigned long	free_objects;
+	int		free_touched;
+	unsigned long	next_reap;
+	struct array_cache	*shared;
+};
+
+#define LIST3_INIT(parent) \
+	{ \
+		.slabs_full	= LIST_HEAD_INIT(parent.slabs_full), \
+		.slabs_partial	= LIST_HEAD_INIT(parent.slabs_partial), \
+		.slabs_free	= LIST_HEAD_INIT(parent.slabs_free) \
+	}
+#define list3_data(cachep) \
+	(&(cachep)->lists)
+
+/* NUMA: per-node */
+#define list3_data_ptr(cachep, ptr) \
+		list3_data(cachep)
+
+/*
+ * kmem_cache_t
+ *
+ * manages a cache.
+ */
+	
+struct kmem_cache_s {
+/* 1) per-cpu data, touched during every alloc/free */
+	struct array_cache	*array[NR_CPUS];
+	unsigned int		batchcount;
+	unsigned int		limit;
+/* 2) touched by every alloc & free from the backend */
+	struct kmem_list3	lists;
+	/* NUMA: kmem_3list_t	*nodelists[MAX_NUMNODES] */
+	unsigned int		objsize;
+	unsigned int	 	flags;	/* constant flags */
+	unsigned int		num;	/* # of objs per slab */
+	unsigned int		free_limit; /* upper limit of objects in the lists */
+	spinlock_t		spinlock;
+
+/* 3) cache_grow/shrink */
+	/* order of pgs per slab (2^n) */
+	unsigned int		gfporder;
+
+	/* force GFP flags, e.g. GFP_DMA */
+	unsigned int		gfpflags;
+
+	size_t			colour;		/* cache colouring range */
+	unsigned int		colour_off;	/* colour offset */
+	unsigned int		colour_next;	/* cache colouring */
+	kmem_cache_t		*slabp_cache;
+	unsigned int		slab_size;
+	unsigned int		dflags;		/* dynamic flags */
+
+	/* constructor func */
+	void (*ctor)(void *, kmem_cache_t *, unsigned long);
+
+	/* de-constructor func */
+	void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
+/* 4) cache creation/removal */
+	const char		*name;
+	struct list_head	next;
+
+/* 5) statistics */
+#if STATS
+	unsigned long		num_active;
+	unsigned long		num_allocations;
+	unsigned long		high_mark;
+	unsigned long		grown;
+	unsigned long		reaped;
+	unsigned long 		errors;
+	unsigned long		max_freeable;
+	unsigned long		node_allocs;
+	atomic_t		allochit;
+	atomic_t		allocmiss;
+	atomic_t		freehit;
+	atomic_t		freemiss;
+#endif
+#if DEBUG
+	int			dbghead;
+	int			reallen;
+#endif
+};
+
+#define CFLGS_OFF_SLAB		(0x80000000UL)
+#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
+
+#define BATCHREFILL_LIMIT	16
+/* Optimization question: fewer reaps means less
+ * probability for unnecessary cpucache drain/refill cycles.
+ *
+ * OTOH the cpuarrays can contain lots of objects,
+ * which could lock up otherwise freeable slabs.
+ */
+#define REAPTIMEOUT_CPUC	(2*HZ)
+#define REAPTIMEOUT_LIST3	(4*HZ)
+
+#if STATS
+#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
+#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
+#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
+#define	STATS_INC_GROWN(x)	((x)->grown++)
+#define	STATS_INC_REAPED(x)	((x)->reaped++)
+#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
+					(x)->high_mark = (x)->num_active; \
+				} while (0)
+#define	STATS_INC_ERR(x)	((x)->errors++)
+#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
+#define	STATS_SET_FREEABLE(x, i) \
+				do { if ((x)->max_freeable < i) \
+					(x)->max_freeable = i; \
+				} while (0)
+
+#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
+#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
+#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
+#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
+#else
+#define	STATS_INC_ACTIVE(x)	do { } while (0)
+#define	STATS_DEC_ACTIVE(x)	do { } while (0)
+#define	STATS_INC_ALLOCED(x)	do { } while (0)
+#define	STATS_INC_GROWN(x)	do { } while (0)
+#define	STATS_INC_REAPED(x)	do { } while (0)
+#define	STATS_SET_HIGH(x)	do { } while (0)
+#define	STATS_INC_ERR(x)	do { } while (0)
+#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
+#define	STATS_SET_FREEABLE(x, i) \
+				do { } while (0)
+
+#define STATS_INC_ALLOCHIT(x)	do { } while (0)
+#define STATS_INC_ALLOCMISS(x)	do { } while (0)
+#define STATS_INC_FREEHIT(x)	do { } while (0)
+#define STATS_INC_FREEMISS(x)	do { } while (0)
+#endif
+
+#if DEBUG
+/* Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
+#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
+
+/* ...and for poisoning */
+#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
+#define POISON_FREE	0x6b	/* for use-after-free poisoning */
+#define	POISON_END	0xa5	/* end-byte of poisoning */
+
+/* memory layout of objects:
+ * 0		: objp
+ * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
+ * 		the end of an object is aligned with the end of the real
+ * 		allocation. Catches writes behind the end of the allocation.
+ * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
+ * 		redzone word.
+ * cachep->dbghead: The real object.
+ * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ */
+static int obj_dbghead(kmem_cache_t *cachep)
+{
+	return cachep->dbghead;
+}
+
+static int obj_reallen(kmem_cache_t *cachep)
+{
+	return cachep->reallen;
+}
+
+/* Address of the red-zone word that sits directly in front of the real object. */
+static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
+{
+	char *real_obj;
+
+	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+	real_obj = (char *)objp + obj_dbghead(cachep);
+	return (unsigned long *)(real_obj - BYTES_PER_WORD);
+}
+
+/*
+ * Address of the trailing red-zone word.  It is the last word of the
+ * object, or the second to last when SLAB_STORE_USER is set.
+ */
+static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
+{
+	char *obj_end;
+	size_t words_back = 1;
+
+	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+	if (cachep->flags & SLAB_STORE_USER)
+		words_back = 2;
+
+	obj_end = (char *)objp + cachep->objsize;
+	return (unsigned long *)(obj_end - words_back*BYTES_PER_WORD);
+}
+
+/* Address of the last word of the object, used when SLAB_STORE_USER is set. */
+static void **dbg_userword(kmem_cache_t *cachep, void *objp)
+{
+	char *obj_end;
+
+	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
+	obj_end = (char *)objp + cachep->objsize;
+	return (void **)(obj_end - BYTES_PER_WORD);
+}
+
+#else
+
+#define obj_dbghead(x)			0
+#define obj_reallen(cachep)		(cachep->objsize)
+#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long *)NULL;})
+#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long *)NULL;})
+#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
+
+#endif
+
+/*
+ * Maximum size of an obj (in 2^order pages)
+ * and absolute limit for the gfp order.
+ */
+#if defined(CONFIG_LARGE_ALLOCS)
+#define	MAX_OBJ_ORDER	13	/* up to 32Mb */
+#define	MAX_GFP_ORDER	13	/* up to 32Mb */
+#elif defined(CONFIG_MMU)
+#define	MAX_OBJ_ORDER	5	/* 32 pages */
+#define	MAX_GFP_ORDER	5	/* 32 pages */
+#else
+#define	MAX_OBJ_ORDER	8	/* up to 1Mb */
+#define	MAX_GFP_ORDER	8	/* up to 1Mb */
+#endif
+
+/*
+ * Do not go above this order unless 0 objects fit into the slab.
+ */
+#define	BREAK_GFP_ORDER_HI	1
+#define	BREAK_GFP_ORDER_LO	0
+static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+
+/* Macros for storing/retrieving the cachep and or slab from the
+ * global 'mem_map'. These are used to find the slab an obj belongs to.
+ * With kfree(), these are used to find the cache which an obj belongs to.
+ */
+#define	SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
+#define	GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
+#define	SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
+#define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)
+
+/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+struct cache_sizes malloc_sizes[] = {
+#define CACHE(x) { .cs_size = (x) },
+#include <linux/kmalloc_sizes.h>
+	CACHE(ULONG_MAX)
+#undef CACHE
+};
+EXPORT_SYMBOL(malloc_sizes);
+
+/* Must match cache_sizes above. Out of line to keep cache footprint low. */
+struct cache_names {
+	char *name;
+	char *name_dma;
+};
+
+static struct cache_names __initdata cache_names[] = {
+#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
+#include <linux/kmalloc_sizes.h>
+	{ NULL, }
+#undef CACHE
+};
+
+static struct arraycache_init initarray_cache __initdata =
+	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+static struct arraycache_init initarray_generic =
+	{ { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+
+/* internal cache of cache description objs */
+static kmem_cache_t cache_cache = {
+	.lists		= LIST3_INIT(cache_cache.lists),
+	.batchcount	= 1,
+	.limit		= BOOT_CPUCACHE_ENTRIES,
+	.objsize	= sizeof(kmem_cache_t),
+	.flags		= SLAB_NO_REAP,
+	.spinlock	= SPIN_LOCK_UNLOCKED,
+	.name		= "kmem_cache",
+#if DEBUG
+	.reallen	= sizeof(kmem_cache_t),
+#endif
+};
+
+/* Guard access to the cache-chain. */
+static struct semaphore	cache_chain_sem;
+static struct list_head cache_chain;
+
+/*
+ * vm_enough_memory() looks at this to determine how many
+ * slab-allocated pages are possibly freeable under pressure
+ *
+ * SLAB_RECLAIM_ACCOUNT turns this on per-slab
+ */
+atomic_t slab_reclaim_pages;
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+	NONE,
+	PARTIAL,
+	FULL
+} g_cpucache_up;
+
+static DEFINE_PER_CPU(struct work_struct, reap_work);
+
+static void free_block(kmem_cache_t* cachep, void** objpp, int len);
+static void enable_cpucache (kmem_cache_t *cachep);
+static void cache_reap (void *unused);
+
+/* The obj pointer array of an array_cache starts right behind its header. */
+static inline void **ac_entry(struct array_cache *ac)
+{
+	return (void **)&ac[1];
+}
+
+/* Return the array_cache of @cachep that belongs to the executing cpu. */
+static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+{
+	struct array_cache **percpu = cachep->array;
+
+	return percpu[smp_processor_id()];
+}
+
+/*
+ * Return the general cache of the first malloc_sizes entry whose cs_size
+ * is at least @size; the DMA variant is chosen when @gfpflags has GFP_DMA.
+ */
+static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
+{
+	struct cache_sizes *slot;
+
+#if DEBUG
+	/* This happens if someone tries to call
+	 * kmem_cache_create(), or __kmalloc(), before
+	 * the generic caches are initialized.
+	 */
+	BUG_ON(malloc_sizes->cs_cachep == NULL);
+#endif
+	for (slot = malloc_sizes; slot->cs_size < size; slot++)
+		;
+
+	/*
+	 * Really subtle: the last entry, with cs_size == ULONG_MAX,
+	 * has cs_{dma,}cachep == NULL. Thus no special case
+	 * for large kmalloc calls is required.
+	 */
+	if (unlikely(gfpflags & GFP_DMA))
+		return slot->cs_dmacachep;
+	return slot->cs_cachep;
+}
+
+/*
+ * Calculate how many objs of @size fit into a slab of 2^@gfporder pages,
+ * and how many bytes are left over.  Unless CFLGS_OFF_SLAB is set in
+ * @flags, room for the struct slab and one kmem_bufctl_t per obj
+ * (rounded up to @align) is taken out of the slab as well.
+ */
+static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
+		 int flags, size_t *left_over, unsigned int *num)
+{
+	size_t slab_bytes = PAGE_SIZE<<gfporder;
+	size_t hdr = 0;		/* fixed on-slab management overhead */
+	size_t per_obj = 0;	/* on-slab management overhead per obj */
+	int nr;
+
+	if (!(flags & CFLGS_OFF_SLAB)) {
+		hdr = sizeof(struct slab);
+		per_obj = sizeof(kmem_bufctl_t);
+	}
+
+	/* Count up to the first number that no longer fits, then step back. */
+	for (nr = 0; nr*size + ALIGN(hdr+nr*per_obj, align) <= slab_bytes; nr++)
+		;
+	if (nr > 0)
+		nr--;
+
+	if (nr > SLAB_LIMIT)
+		nr = SLAB_LIMIT;
+
+	*num = nr;
+	*left_over = slab_bytes - nr*size - ALIGN(hdr+nr*per_obj, align);
+}
+
+#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)	/* supplies the caller's function name */
+
+static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
+{
+	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+		function, cachep->name, msg);
+	dump_stack();	/* follow the message with a stack trace */
+}
+
+/*
+ * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
+ * via the workqueue/eventd.
+ * Add the CPU number into the expiration time to minimize the possibility of
+ * the CPUs getting into lockstep and contending for the global cache chain
+ * lock.
+ */
+static void __devinit start_cpu_timer(int cpu)
+{
+	struct work_struct *work = &per_cpu(reap_work, cpu);
+
+	/*
+	 * When this gets called from do_initcalls via cpucache_init(),
+	 * init_workqueues() has already run, so keventd will be setup
+	 * at that time.
+	 *
+	 * Nothing to do while keventd is not up, or when this cpu's
+	 * work item has been initialised already.
+	 */
+	if (!keventd_up() || work->func != NULL)
+		return;
+
+	INIT_WORK(work, cache_reap, NULL);
+	schedule_delayed_work_on(cpu, work, HZ + 3 * cpu);
+}
+
+/*
+ * Allocate and initialise an empty array_cache with room for @entries
+ * obj pointers.  Unless @cpu is -1, an allocation on that cpu's node is
+ * tried first; plain kmalloc() is the fallback.
+ * Returns NULL if no memory could be obtained.
+ */
+static struct array_cache *alloc_arraycache(int cpu, int entries,
+						int batchcount)
+{
+	int bytes = sizeof(struct array_cache) + sizeof(void*)*entries;
+	struct array_cache *ac = NULL;
+
+	if (cpu != -1) {
+		kmem_cache_t *general;
+
+		general = kmem_find_general_cachep(bytes, GFP_KERNEL);
+		if (general)
+			ac = kmem_cache_alloc_node(general, cpu_to_node(cpu));
+	}
+	if (!ac)
+		ac = kmalloc(bytes, GFP_KERNEL);
+	if (!ac)
+		return NULL;
+
+	ac->avail = 0;
+	ac->limit = entries;
+	ac->batchcount = batchcount;
+	ac->touched = 0;
+	return ac;
+}
+
+/*
+ * CPU notifier: keeps the per-cpu array of every cache on the cache chain
+ * in step with cpus coming and going.
+ *
+ * CPU_UP_PREPARE:  allocate an array_cache for the cpu in each cache;
+ *                  returns NOTIFY_BAD as soon as one allocation fails.
+ * CPU_ONLINE:      start the reap timer on the new cpu.
+ * CPU_DEAD and CPU_UP_CANCELED (CONFIG_HOTPLUG_CPU only):
+ *                  hand the cpu's cached objs back through free_block()
+ *                  and free its array_cache.
+ *
+ * @hcpu carries the cpu number.  Every path other than a failed
+ * CPU_UP_PREPARE returns NOTIFY_OK.
+ */
+static int __devinit cpuup_callback(struct notifier_block *nfb,
+				  unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	kmem_cache_t* cachep;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		down(&cache_chain_sem);
+		list_for_each_entry(cachep, &cache_chain, next) {
+			struct array_cache *nc;
+
+			nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+			if (!nc)
+				goto bad;
+
+			spin_lock_irq(&cachep->spinlock);
+			cachep->array[cpu] = nc;
+			cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+						+ cachep->num;
+			spin_unlock_irq(&cachep->spinlock);
+
+		}
+		up(&cache_chain_sem);
+		break;
+	case CPU_ONLINE:
+		start_cpu_timer(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		/* fall thru */
+	case CPU_UP_CANCELED:
+		down(&cache_chain_sem);
+
+		list_for_each_entry(cachep, &cache_chain, next) {
+			struct array_cache *nc;
+
+			spin_lock_irq(&cachep->spinlock);
+			/* cpu is dead; no one can alloc from it. */
+			nc = cachep->array[cpu];
+			cachep->array[cpu] = NULL;
+			cachep->free_limit -= cachep->batchcount;
+			/*
+			 * A CPU_UP_PREPARE that bailed out part-way may have
+			 * left this cache without an array for the cpu, yet
+			 * the whole chain is walked here: do not touch a
+			 * NULL array.
+			 */
+			if (nc)
+				free_block(cachep, ac_entry(nc), nc->avail);
+			spin_unlock_irq(&cachep->spinlock);
+			kfree(nc);	/* kfree(NULL) is a no-op */
+		}
+		up(&cache_chain_sem);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+bad:
+	up(&cache_chain_sem);
+	return NOTIFY_BAD;
+}
+
+static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+
+/* Initialisation.
+ * Called after the gfp() functions have been enabled, and before smp_init().
+ */
+void __init kmem_cache_init(void)
+{
+	size_t left_over;
+	struct cache_sizes *sizes;
+	struct cache_names *names;
+
+	/*
+	 * Fragmentation resistance on low memory - only use bigger
+	 * page orders on machines with more than 32MB of memory.
+	 */
+	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+
+	
+	/* Bootstrap is tricky, because several objects are allocated
+	 * from caches that do not exist yet:
+	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
+	 *    structures of all caches, except cache_cache itself: cache_cache
+	 *    is statically allocated.
+	 *    Initially an __init data area is used for the head array, it's
+	 *    replaced with a kmalloc allocated array at the end of the bootstrap.
+	 * 2) Create the first kmalloc cache.
+	 *    The kmem_cache_t for the new cache is allocated normally. An __init
+	 *    data area is used for the head array.
+	 * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+	 * 4) Replace the __init data head arrays for cache_cache and the first
+	 *    kmalloc cache with kmalloc allocated arrays.
+	 * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+	 */
+
+	/* 1) create the cache_cache */
+	init_MUTEX(&cache_chain_sem);
+	INIT_LIST_HEAD(&cache_chain);
+	list_add(&cache_cache.next, &cache_chain);
+	cache_cache.colour_off = cache_line_size();
+	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;	/* bootstrap head array, swapped out in step 4 */
+
+	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
+
+	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+				&left_over, &cache_cache.num);
+	if (!cache_cache.num)
+		BUG();
+
+	cache_cache.colour = left_over/cache_cache.colour_off;
+	cache_cache.colour_next = 0;
+	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
+				sizeof(struct slab), cache_line_size());
+
+	/* 2+3) create the kmalloc caches */
+	sizes = malloc_sizes;
+	names = cache_names;
+
+	while (sizes->cs_size != ULONG_MAX) {	/* the ULONG_MAX entry terminates malloc_sizes */
+		/* For performance, all the general caches are L1 aligned.
+		 * This should be particularly beneficial on SMP boxes, as it
+		 * eliminates "false sharing".
+		 * Note for systems short on memory removing the alignment will
+		 * allow tighter packing of the smaller caches. */
+		sizes->cs_cachep = kmem_cache_create(names->name,
+			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+			(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
+		/* Inc off-slab bufctl limit until the ceiling is hit. */
+		if (!(OFF_SLAB(sizes->cs_cachep))) {
+			offslab_limit = sizes->cs_size-sizeof(struct slab);
+			offslab_limit /= sizeof(kmem_bufctl_t);
+		}
+
+		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
+			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+			(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
+			NULL, NULL);
+
+		sizes++;
+		names++;
+	}
+	/* 4) Replace the bootstrap head arrays */
+	{
+		void * ptr;
+		
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);	/* NOTE(review): used below without a NULL check */
+		local_irq_disable();
+		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
+		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+		cache_cache.array[smp_processor_id()] = ptr;
+		local_irq_enable();
+	
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);	/* NOTE(review): used below without a NULL check */
+		local_irq_disable();
+		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
+		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+				sizeof(struct arraycache_init));
+		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+		local_irq_enable();
+	}
+
+	/* 5) resize the head arrays to their final sizes */
+	{
+		kmem_cache_t *cachep;
+		down(&cache_chain_sem);
+		list_for_each_entry(cachep, &cache_chain, next)
+			enable_cpucache(cachep);
+		up(&cache_chain_sem);
+	}
+
+	/* Done! */
+	g_cpucache_up = FULL;
+
+	/* Register a cpu startup notifier callback
+	 * that initializes ac_data for all new cpus
+	 */
+	register_cpu_notifier(&cpucache_notifier);
+	
+
+	/* The reap timers are started later, with a module init call:
+	 * That part of the kernel is not yet operational.
+	 */
+}
+
+static int __init cpucache_init(void)
+{
+	int nr;
+
+	/*
+	 * Start the reap timer on every cpu that is already online; these
+	 * timers are what return unneeded pages to the page allocator.
+	 * Always reports success (0) to the initcall machinery.
+	 */
+	for (nr = 0; nr < NR_CPUS; nr++) {
+		if (!cpu_online(nr))
+			continue;
+		start_cpu_timer(nr);
+	}
+
+	return 0;
+}
+
+__initcall(cpucache_init);
+
+/*
+ * Interface to system's page allocator. No need to hold the cache-lock.
+ *
+ * If we requested dmaable memory, we will get it. Even if we
+ * did not request dmaable memory, we might get it, but that
+ * would be relatively rare and ignorable.
+ */
+static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+{
+	struct page *first, *pg;
+	void *addr;
+	int nr_pages, left;
+
+	/* Every request also carries the cache's own gfp flags. */
+	flags |= cachep->gfpflags;
+
+	/* nodeid == -1 takes the plain allocator call, anything else the
+	 * node-specific one. */
+	if (likely(nodeid == -1))
+		first = alloc_pages(flags, cachep->gfporder);
+	else
+		first = alloc_pages_node(nodeid, flags, cachep->gfporder);
+	if (!first)
+		return NULL;
+	addr = page_address(first);
+
+	/* Account the whole 2^gfporder page run ... */
+	nr_pages = (1 << cachep->gfporder);
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		atomic_add(nr_pages, &slab_reclaim_pages);
+	add_page_state(nr_slab, nr_pages);
+
+	/* ... and flag each of its pages as a slab page. */
+	pg = first;
+	for (left = nr_pages; left > 0; left--) {
+		SetPageSlab(pg);
+		pg++;
+	}
+	return addr;
+}
+
+/*
+ * Interface to system's page release.
+ */
+static void kmem_freepages(kmem_cache_t *cachep, void *addr)
+{
+	const unsigned long nr_freed = (1<<cachep->gfporder);
+	struct page *pg = virt_to_page(addr);
+	unsigned long left;
+
+	/* Every page of the run must still carry the slab flag. */
+	for (left = nr_freed; left != 0; left--, pg++) {
+		if (!TestClearPageSlab(pg))
+			BUG();
+	}
+
+	sub_page_state(nr_slab, nr_freed);
+	/* Credit the pages to current's reclaim_state when one is attached. */
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_freed;
+
+	free_pages((unsigned long)addr, cachep->gfporder);
+
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
+}
+
+static void kmem_rcu_free(struct rcu_head *head)
+{
+	/* The rcu_head is treated as the start of a struct slab_rcu. */
+	struct slab_rcu *rcu = (struct slab_rcu *) head;
+	kmem_cache_t *owner = rcu->cachep;
+
+	kmem_freepages(owner, rcu->addr);
+
+	/* Only an off-slab descriptor has to be freed separately. */
+	if (!OFF_SLAB(owner))
+		return;
+	kmem_cache_free(owner->slabp_cache, rcu);
+}
+
+#if DEBUG
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
+				unsigned long caller)
+{
+	unsigned long *stack = &caller;
+	unsigned long word;
+	int room = obj_reallen(cachep);
+
+	/* Skip the debug head: the record goes into the real object. */
+	addr = (unsigned long *)((char *)addr + obj_dbghead(cachep));
+
+	/* Objects shorter than five words get no record at all. */
+	if (room < 5*sizeof(unsigned long))
+		return;
+
+	/* Header: start marker, caller, current cpu. */
+	addr[0] = 0x12345678;
+	addr[1] = caller;
+	addr[2] = smp_processor_id();
+	addr += 3;
+	room -= 3*sizeof(unsigned long);
+
+	/*
+	 * Scan the stack upwards from our own argument and keep every
+	 * word that is a kernel text address, until the stack ends or
+	 * only the slot for the end marker is left.
+	 */
+	for (;;) {
+		if (kstack_end(stack))
+			break;
+		word = *stack++;
+		if (!kernel_text_address(word))
+			continue;
+		*addr++ = word;
+		room -= sizeof(unsigned long);
+		if (room <= sizeof(unsigned long))
+			break;
+	}
+
+	/* End marker. */
+	*addr = 0x87654321;
+}
+#endif
+
+static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
+{
+	int len = obj_reallen(cachep);
+	unsigned char *payload = (unsigned char *)addr + obj_dbghead(cachep);
+
+	/* Fill the real object with @val, then mark its very last byte. */
+	memset(payload, val, len);
+	payload[len-1] = POISON_END;
+}
+
+static void dump_line(char *data, int offset, int limit)
+{
+	const unsigned char *cur = (unsigned char *)data + offset;
+	int left;
+
+	/* One hexdump row: the offset, then @limit bytes starting there. */
+	printk(KERN_ERR "%03x:", offset);
+	for (left = limit; left > 0; left--)
+		printk(" %02x", *cur++);
+	printk("\n");
+}
+#endif
+
+#if DEBUG
+
+static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
+{
+	char *payload;
+	int len, pos;
+
+	/* Debug words first, as far as the cache keeps them. */
+	if (cachep->flags & SLAB_RED_ZONE)
+		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
+			*dbg_redzone1(cachep, objp),
+			*dbg_redzone2(cachep, objp));
+
+	if (cachep->flags & SLAB_STORE_USER) {
+		printk(KERN_ERR "Last user: [<%p>]",
+				*dbg_userword(cachep, objp));
+		print_symbol("(%s)",
+				(unsigned long)*dbg_userword(cachep, objp));
+		printk("\n");
+	}
+
+	/* Then hexdump the real object, 16 bytes per row, while @lines lasts. */
+	payload = (char*)objp+obj_dbghead(cachep);
+	len = obj_reallen(cachep);
+	pos = 0;
+	while (pos < len && lines) {
+		int chunk = len - pos;
+
+		if (chunk > 16)
+			chunk = 16;
+		dump_line(payload, pos, chunk);
+		pos += 16;
+		lines--;
+	}
+}
+
+/*
+ * Verify that a free object still holds the poison pattern: every byte of
+ * the real object must be POISON_FREE, except the last one, which must be
+ * POISON_END.
+ *
+ * @cachep: cache the object belongs to.
+ * @objp: start of the object, including its debug head.
+ *
+ * On a mismatch the affected 16-byte lines are hexdumped, followed by some
+ * data about the neighbouring objects in the same slab.
+ */
+static void check_poison_obj(kmem_cache_t *cachep, void *objp)
+{
+	char *realobj;
+	int size, i;
+	int lines = 0;
+
+	realobj = (char*)objp+obj_dbghead(cachep);
+	size = obj_reallen(cachep);
+
+	for (i=0;i<size;i++) {
+		char exp = POISON_FREE;
+		if (i == size-1)
+			exp = POISON_END;
+		if (realobj[i] != exp) {
+			int limit;
+			/* Mismatch ! */
+			/* Print header */
+			if (lines == 0) {
+				printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
+						realobj, size);
+				print_objinfo(cachep, objp, 0);
+			}
+			/* Hexdump the affected line */
+			i = (i/16)*16;
+			limit = 16;
+			if (i+limit > size)
+				limit = size-i;
+			dump_line(realobj, i, limit);
+			/*
+			 * Park on the last byte of the line just dumped: the
+			 * loop's own i++ then resumes at the first byte of
+			 * the next line instead of stepping over it unchecked.
+			 */
+			i += 15;
+			lines++;
+			/* Stop scanning once the sixth line has been dumped */
+			if (lines > 5)
+				break;
+		}
+	}
+	if (lines != 0) {
+		/* Print some data about the neighboring objects, if they
+		 * exist:
+		 */
+		struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+		int objnr;
+
+		/* index of the corrupted object within its slab */
+		objnr = (objp-slabp->s_mem)/cachep->objsize;
+		if (objnr) {
+			objp = slabp->s_mem+(objnr-1)*cachep->objsize;
+			realobj = (char*)objp+obj_dbghead(cachep);
+			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
+						realobj, size);
+			print_objinfo(cachep, objp, 2);
+		}
+		if (objnr+1 < cachep->num) {
+			objp = slabp->s_mem+(objnr+1)*cachep->objsize;
+			realobj = (char*)objp+obj_dbghead(cachep);
+			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
+						realobj, size);
+			print_objinfo(cachep, objp, 2);
+		}
+	}
+}
+#endif
+
+/* Destroy all the objs in a slab, and release the mem back to the system.
+ * Before calling the slab must have been unlinked from the cache.
+ * The cache-lock is not held/needed.
+ *
+ * @cachep: cache that owns the slab.
+ * @slabp: descriptor of the slab to tear down.
+ *
+ * With DEBUG enabled every object is checked first (poison pattern and
+ * red zones).  The destructor, if any, is run on each object; in DEBUG
+ * builds only when the cache is not poisoned.  For SLAB_DESTROY_BY_RCU
+ * caches nothing is freed here: the release is queued with call_rcu()
+ * and carried out by kmem_rcu_free().
+ */
+static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
+{
+	void *addr = slabp->s_mem - slabp->colouroff;	/* undo the offset to get the page address */
+
+#if DEBUG
+	int i;
+	for (i = 0; i < cachep->num; i++) {
+		void *objp = slabp->s_mem + cachep->objsize * i;
+
+		if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
+				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
+			else
+				check_poison_obj(cachep, objp);
+#else
+			check_poison_obj(cachep, objp);
+#endif
+		}
+		if (cachep->flags & SLAB_RED_ZONE) {
+			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+				slab_error(cachep, "start of a freed object "
+							"was overwritten");
+			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+				slab_error(cachep, "end of a freed object "
+							"was overwritten");
+		}
+		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
+			(cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
+	}
+#else
+	if (cachep->dtor) {
+		int i;
+		for (i = 0; i < cachep->num; i++) {
+			void* objp = slabp->s_mem+cachep->objsize*i;
+			(cachep->dtor)(objp, cachep, 0);
+		}
+	}
+#endif
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+		struct slab_rcu *slab_rcu;
+
+		slab_rcu = (struct slab_rcu *) slabp;	/* the descriptor's storage is reused for the rcu request */
+		slab_rcu->cachep = cachep;
+		slab_rcu->addr = addr;
+		call_rcu(&slab_rcu->head, kmem_rcu_free);
+	} else {
+		kmem_freepages(cachep, addr);
+		if (OFF_SLAB(cachep))
+			kmem_cache_free(cachep->slabp_cache, slabp);
+	}
+}
+
+/**
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ * @dtor: A destructor for the objects.
+ *
+ * Returns a ptr to the cache on success, NULL on failure.  If %SLAB_PANIC
+ * is set in @flags, a failure panics instead of returning NULL.
+ * Cannot be called from interrupt context, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache
+ * and the @dtor is run before the pages are handed back.
+ *
+ * Usage errors are fatal: a missing @name, a call from interrupt context,
+ * a @size below BYTES_PER_WORD or above (1<<MAX_OBJ_ORDER)*PAGE_SIZE, a
+ * @dtor without a @ctor, flags outside CREATE_MASK, a @dtor combined with
+ * %SLAB_DESTROY_BY_RCU, or a @name that is already in use all end in BUG().
+ *
+ * @name must be valid until the cache is destroyed. This implies that
+ * the module calling this has to destroy the cache before getting
+ * unloaded.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
+ * memory pressure.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline.  This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * %SLAB_CACHE_DMA - Add GFP_DMA to the gfp flags stored for this cache.
+ *
+ * %SLAB_RECLAIM_ACCOUNT - If the objects are no larger than a page, the
+ * slabs of this cache are kept at page order 0.
+ */
+kmem_cache_t *
+kmem_cache_create (const char *name, size_t size, size_t align,
+	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
+	void (*dtor)(void*, kmem_cache_t *, unsigned long))
+{
+	size_t left_over, slab_size, ralign;
+	kmem_cache_t *cachep = NULL;
+
+	/*
+	 * Sanity checks... these are all serious usage bugs.
+	 */
+	if ((!name) ||
+		in_interrupt() ||
+		(size < BYTES_PER_WORD) ||
+		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
+		(dtor && !ctor)) {
+			printk(KERN_ERR "%s: Early error in slab %s\n",
+					__FUNCTION__, name);
+			BUG();
+		}
+
+#if DEBUG
+	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
+	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
+		/* No constructor, but initial state check requested */
+		printk(KERN_ERR "%s: No con, but init state check "
+				"requested - %s\n", __FUNCTION__, name);
+		flags &= ~SLAB_DEBUG_INITIAL;
+	}
+
+#if FORCED_DEBUG
+	/*
+	 * Enable redzoning and last user accounting, except for caches with
+	 * large objects, if the increased size would increase the object size
+	 * above the next power of two: caches with object sizes just above a
+	 * power of two have a significant amount of internal fragmentation.
+	 */
+	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
+		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
+	if (!(flags & SLAB_DESTROY_BY_RCU))
+		flags |= SLAB_POISON;
+#endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(flags & SLAB_POISON);
+#endif
+	if (flags & SLAB_DESTROY_BY_RCU)
+		BUG_ON(dtor);
+
+	/*
+	 * Always checks flags, a caller might be expecting debug
+	 * support which isn't available.
+	 */
+	if (flags & ~CREATE_MASK)
+		BUG();
+
+	/* Check that size is in terms of words.  This is needed to avoid
+	 * unaligned accesses for some archs when redzoning is used, and makes
+	 * sure any on-slab bufctl's are also correctly aligned.
+	 * A size that is not a multiple of the word size is rounded up.
+	 */
+	if (size & (BYTES_PER_WORD-1)) {
+		size += (BYTES_PER_WORD-1);
+		size &= ~(BYTES_PER_WORD-1);
+	}
+
+	/* calculate out the final buffer alignment: */
+	/* 1) arch recommendation: can be overridden for debug */
+	if (flags & SLAB_HWCACHE_ALIGN) {
+		/* Default alignment: as specified by the arch code.
+		 * Except if an object is really small, then squeeze multiple
+		 * objects into one cacheline.
+		 */
+		ralign = cache_line_size();
+		while (size <= ralign/2)
+			ralign /= 2;
+	} else {
+		ralign = BYTES_PER_WORD;
+	}
+	/* 2) arch mandated alignment: disables debug if necessary */
+	if (ralign < ARCH_SLAB_MINALIGN) {
+		ralign = ARCH_SLAB_MINALIGN;
+		if (ralign > BYTES_PER_WORD)
+			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+	}
+	/* 3) caller mandated alignment: disables debug if necessary */
+	if (ralign < align) {
+		ralign = align;
+		if (ralign > BYTES_PER_WORD)
+			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+	}
+	/* 4) Store it. Note that the debug code below can reduce
+	 *    the alignment to BYTES_PER_WORD.
+	 */
+	align = ralign;
+
+	/* Get cache's description obj. */
+	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
+	if (!cachep)
+		goto opps;
+	memset(cachep, 0, sizeof(kmem_cache_t));
+
+#if DEBUG
+	cachep->reallen = size;	/* object size before any debug words are added */
+
+	if (flags & SLAB_RED_ZONE) {
+		/* redzoning only works with word aligned caches */
+		align = BYTES_PER_WORD;
+
+		/* add space for red zone words */
+		cachep->dbghead += BYTES_PER_WORD;
+		size += 2*BYTES_PER_WORD;
+	}
+	if (flags & SLAB_STORE_USER) {
+		/* user store requires word alignment and
+		 * one word storage behind the end of the real
+		 * object.
+		 */
+		align = BYTES_PER_WORD;
+		size += BYTES_PER_WORD;
+	}
+#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
+	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
+		cachep->dbghead += PAGE_SIZE - size;	/* grow the debug head so the object fills a page */
+		size = PAGE_SIZE;
+	}
+#endif
+#endif
+
+	/* Determine if the slab management is 'on' or 'off' slab. */
+	if (size >= (PAGE_SIZE>>3))
+		/*
+		 * Size is large, assume best to place the slab management obj
+		 * off-slab (should allow better packing of objs).
+		 */
+		flags |= CFLGS_OFF_SLAB;
+
+	size = ALIGN(size, align);
+
+	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
+		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		cachep->gfporder = 0;
+		cache_estimate(cachep->gfporder, size, align, flags,
+					&left_over, &cachep->num);
+	} else {
+		/*
+		 * Calculate size (in pages) of slabs, and the num of objs per
+		 * slab.  This could be made much more intelligent.  For now,
+		 * try to avoid using high page-orders for slabs.  When the
+		 * gfp() funcs are more friendly towards high-order requests,
+		 * this should be changed.
+		 */
+		do {
+			unsigned int break_flag = 0;	/* set once we stepped back one order */
+cal_wastage:
+			cache_estimate(cachep->gfporder, size, align, flags,
+						&left_over, &cachep->num);
+			if (break_flag)
+				break;
+			if (cachep->gfporder >= MAX_GFP_ORDER)
+				break;
+			if (!cachep->num)
+				goto next;
+			if (flags & CFLGS_OFF_SLAB &&
+					cachep->num > offslab_limit) {
+				/* This num of objs will cause problems. */
+				cachep->gfporder--;
+				break_flag++;
+				goto cal_wastage;
+			}
+
+			/*
+			 * Large num of objs is good, but v. large slabs are
+			 * currently bad for the gfp()s.
+			 */
+			if (cachep->gfporder >= slab_break_gfp_order)
+				break;
+
+			if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
+				break;	/* Acceptable internal fragmentation. */
+next:
+			cachep->gfporder++;
+		} while (1);
+	}
+
+	if (!cachep->num) {
+		printk("kmem_cache_create: couldn't create cache %s.\n", name);
+		kmem_cache_free(&cache_cache, cachep);
+		cachep = NULL;
+		goto opps;
+	}
+	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
+				+ sizeof(struct slab), align);
+
+	/*
+	 * If the slab has been placed off-slab, and we have enough space then
+	 * move it on-slab. This is at the expense of any extra colouring.
+	 */
+	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+		flags &= ~CFLGS_OFF_SLAB;
+		left_over -= slab_size;
+	}
+
+	if (flags & CFLGS_OFF_SLAB) {
+		/* really off slab. No need for manual alignment */
+		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+	}
+
+	cachep->colour_off = cache_line_size();
+	/* Offset must be a multiple of the alignment. */
+	if (cachep->colour_off < align)
+		cachep->colour_off = align;
+	cachep->colour = left_over/cachep->colour_off;
+	cachep->slab_size = slab_size;
+	cachep->flags = flags;
+	cachep->gfpflags = 0;
+	if (flags & SLAB_CACHE_DMA)
+		cachep->gfpflags |= GFP_DMA;
+	spin_lock_init(&cachep->spinlock);
+	cachep->objsize = size;
+	/* NUMA */
+	INIT_LIST_HEAD(&cachep->lists.slabs_full);
+	INIT_LIST_HEAD(&cachep->lists.slabs_partial);
+	INIT_LIST_HEAD(&cachep->lists.slabs_free);
+
+	if (flags & CFLGS_OFF_SLAB)
+		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
+	cachep->ctor = ctor;
+	cachep->dtor = dtor;
+	cachep->name = name;
+
+	/* Don't let CPUs come and go */
+	lock_cpu_hotplug();
+
+	if (g_cpucache_up == FULL) {
+		enable_cpucache(cachep);
+	} else {
+		if (g_cpucache_up == NONE) {
+			/* Note: the first kmem_cache_create must create
+			 * the cache that's used by kmalloc(24), otherwise
+			 * the creation of further caches will BUG().
+			 */
+			cachep->array[smp_processor_id()] = &initarray_generic.cache;
+			g_cpucache_up = PARTIAL;
+		} else {
+			cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+		}
+		BUG_ON(!ac_data(cachep));
+		ac_data(cachep)->avail = 0;
+		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+		ac_data(cachep)->batchcount = 1;
+		ac_data(cachep)->touched = 0;
+		cachep->batchcount = 1;
+		cachep->limit = BOOT_CPUCACHE_ENTRIES;
+		cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+					+ cachep->num;
+	} 
+
+	cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
+					((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
+	/* Need the semaphore to access the chain. */
+	down(&cache_chain_sem);
+	{
+		struct list_head *p;
+		mm_segment_t old_fs;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		list_for_each(p, &cache_chain) {
+			kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
+			char tmp;
+			/* This happens when the module gets unloaded and doesn't
+			   destroy its slab cache and no one else reuses the vmalloc
+			   area of the module. Print a warning. */
+			if (__get_user(tmp,pc->name)) { 
+				printk("SLAB: cache with size %d has lost its name\n", 
+					pc->objsize); 
+				continue; 
+			} 	
+			if (!strcmp(pc->name,name)) { 
+				printk("kmem_cache_create: duplicate cache %s\n",name); 
+				up(&cache_chain_sem); 
+				unlock_cpu_hotplug();
+				BUG(); 
+			}	
+		}
+		set_fs(old_fs);
+	}
+
+	/* cache setup completed, link it into the list */
+	list_add(&cachep->next, &cache_chain);
+	up(&cache_chain_sem);
+	unlock_cpu_hotplug();
+opps:
+	if (!cachep && (flags & SLAB_PANIC))
+		panic("kmem_cache_create(): failed to create slab `%s'\n",
+			name);
+	return cachep;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#if DEBUG
+static void check_irq_off(void)
+{
+	/* Debug assertion: local interrupts have to be disabled here. */
+	if (unlikely(!irqs_disabled()))
+		BUG();
+}
+
+static void check_irq_on(void)
+{
+	/* Debug assertion: local interrupts have to be enabled here. */
+	if (unlikely(irqs_disabled()))
+		BUG();
+}
+
+static void check_spinlock_acquired(kmem_cache_t *cachep)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Debug assertion: with the cache lock held, interrupts are off
+	 * and a trylock on that lock cannot succeed.
+	 */
+	check_irq_off();
+	if (unlikely(spin_trylock(&cachep->spinlock)))
+		BUG();
+#endif
+}
+#else
+#define check_irq_off()	do { } while(0)
+#define check_irq_on()	do { } while(0)
+#define check_spinlock_acquired(x) do { } while(0)
+#endif
+
+/*
+ * Waits for all CPUs to execute func().
+ */
+static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
+{
+	int failed;
+
+	check_irq_on();
+	preempt_disable();
+
+	/* This cpu goes first: func() is run here with interrupts disabled. */
+	local_irq_disable();
+	func(arg);
+	local_irq_enable();
+
+	/* Then hand func() to smp_call_function(); a failure is fatal. */
+	failed = smp_call_function(func, arg, 1, 1);
+	if (failed)
+		BUG();
+
+	preempt_enable();
+}
+
+static void drain_array_locked(kmem_cache_t* cachep,
+				struct array_cache *ac, int force);
+
+static void do_drain(void *arg)
+{
+	kmem_cache_t *cachep = arg;
+	struct array_cache *local;
+
+	check_irq_off();
+	local = ac_data(cachep);
+
+	/* Pass every entry of this cpu's array to free_block(), under the
+	 * cache lock, and mark the array empty. */
+	spin_lock(&cachep->spinlock);
+	free_block(cachep, &ac_entry(local)[0], local->avail);
+	spin_unlock(&cachep->spinlock);
+	local->avail = 0;
+}
+
+static void drain_cpu_caches(kmem_cache_t *cachep)
+{
+	struct array_cache *shared;
+
+	/* First run do_drain() for the per-cpu arrays ... */
+	smp_call_function_all_cpus(do_drain, cachep);
+	check_irq_on();
+
+	/* ... then drain the shared array, if the cache has one. */
+	spin_lock_irq(&cachep->spinlock);
+	shared = cachep->lists.shared;
+	if (shared)
+		drain_array_locked(cachep, shared, 1);
+	spin_unlock_irq(&cachep->spinlock);
+}
+
+
+/* NUMA shrink all list3s */
+static int __cache_shrink(kmem_cache_t *cachep)
+{
+	struct list_head *free_list = &cachep->lists.slabs_free;
+	struct slab *victim;
+	int busy;
+
+	drain_cpu_caches(cachep);
+
+	check_irq_on();
+	spin_lock_irq(&cachep->spinlock);
+
+	/* Destroy the free slabs one by one, taking them from the list tail. */
+	while (free_list->prev != free_list) {
+		victim = list_entry(free_list->prev, struct slab, list);
+#if DEBUG
+		if (victim->inuse)
+			BUG();
+#endif
+		list_del(&victim->list);
+		cachep->lists.free_objects -= cachep->num;
+
+		/* The cache lock is dropped around slab_destroy(). */
+		spin_unlock_irq(&cachep->spinlock);
+		slab_destroy(cachep, victim);
+		spin_lock_irq(&cachep->spinlock);
+	}
+
+	/* Non-zero if full or partial slabs are left behind. */
+	busy = !list_empty(&cachep->lists.slabs_full) ||
+		!list_empty(&cachep->lists.slabs_partial);
+	spin_unlock_irq(&cachep->spinlock);
+	return busy;
+}
+
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(kmem_cache_t *cachep)
+{
+	/* A NULL cache or a call from interrupt context is a caller bug. */
+	BUG_ON(!cachep || in_interrupt());
+
+	return __cache_shrink(cachep);
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+/**
+ * kmem_cache_destroy - delete a cache
+ * @cachep: the cache to destroy
+ *
+ * Remove a kmem_cache_t object from the slab cache.
+ * Returns 0 on success.  Returns 1 if the cache still holds full or
+ * partial slabs; in that case the cache is put back on the cache chain
+ * and is not freed.
+ *
+ * It is expected this function will be called by a module when it is
+ * unloaded.  This will remove the cache completely, and avoid a duplicate
+ * cache being allocated each time a module is loaded and unloaded, if the
+ * module doesn't have persistent in-kernel storage across loads and unloads.
+ *
+ * The cache must be empty before calling this function.
+ *
+ * The caller must guarantee that no one will allocate memory from the cache
+ * during the kmem_cache_destroy().
+ *
+ * A NULL @cachep or a call from interrupt context ends in BUG().
+ */
+int kmem_cache_destroy(kmem_cache_t * cachep)
+{
+	int i;
+
+	if (!cachep || in_interrupt())
+		BUG();
+
+	/* Don't let CPUs come and go */
+	lock_cpu_hotplug();
+
+	/* Unlink the cache from the chain of caches. */
+	down(&cache_chain_sem);
+	/*
+	 * the chain is never empty, cache_cache is never destroyed
+	 */
+	list_del(&cachep->next);
+	up(&cache_chain_sem);
+
+	if (__cache_shrink(cachep)) {
+		slab_error(cachep, "Can't free all objects");
+		down(&cache_chain_sem);
+		list_add(&cachep->next,&cache_chain);	/* undo the unlink above */
+		up(&cache_chain_sem);
+		unlock_cpu_hotplug();
+		return 1;
+	}
+
+	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+		synchronize_kernel();
+
+	/* no cpu_online check required here since we clear the percpu
+	 * array on cpu offline and set this to NULL.
+	 */
+	for (i = 0; i < NR_CPUS; i++)
+		kfree(cachep->array[i]);
+
+	/* NUMA: free the list3 structures */
+	kfree(cachep->lists.shared);
+	cachep->lists.shared = NULL;
+	kmem_cache_free(&cache_cache, cachep);
+
+	unlock_cpu_hotplug();
+
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/*
+ * Set up the management object (struct slab) for a new slab.
+ *
+ * @objp is the start of the slab's memory and @colour_off the colour
+ * offset chosen for it.  For an off-slab cache the descriptor is allocated
+ * from cachep->slabp_cache with @local_flags; otherwise it is placed in
+ * the slab itself at the colour offset and the objects start behind it.
+ * Returns the descriptor, or NULL if the off-slab allocation failed.
+ */
+static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
+			void *objp, int colour_off, unsigned int __nocast local_flags)
+{
+	struct slab *mgmt;
+
+	if (!OFF_SLAB(cachep)) {
+		/* On-slab: the descriptor takes slab_size bytes up front. */
+		mgmt = objp+colour_off;
+		colour_off += cachep->slab_size;
+	} else {
+		mgmt = kmem_cache_alloc(cachep->slabp_cache, local_flags);
+		if (!mgmt)
+			return NULL;
+	}
+
+	mgmt->inuse = 0;
+	mgmt->colouroff = colour_off;
+	mgmt->s_mem = objp+colour_off;
+
+	return mgmt;
+}
+
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+	/* The bufctl array starts right behind the slab descriptor. */
+	return (kmem_bufctl_t *)&slabp[1];
+}
+
+static void cache_init_objs(kmem_cache_t *cachep,
+			struct slab *slabp, unsigned long ctor_flags)
+{
+	kmem_bufctl_t *bufctl = slab_bufctl(slabp);
+	int nr;
+
+	for (nr = 0; nr < cachep->num; nr++) {
+		void *obj = slabp->s_mem+cachep->objsize*nr;
+#if DEBUG
+		/* Lay down the debug patterns before the constructor runs. */
+		if (cachep->flags & SLAB_POISON)
+			poison_obj(cachep, obj, POISON_FREE);
+		if (cachep->flags & SLAB_STORE_USER)
+			*dbg_userword(cachep, obj) = NULL;
+
+		if (cachep->flags & SLAB_RED_ZONE) {
+			*dbg_redzone1(cachep, obj) = RED_INACTIVE;
+			*dbg_redzone2(cachep, obj) = RED_INACTIVE;
+		}
+		/*
+		 * A constructor must not allocate from the cache it belongs
+		 * to (that would deadlock) and it has to be threaded.
+		 * Poisoned caches do not run it here.
+		 */
+		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+			cachep->ctor(obj+obj_dbghead(cachep), cachep, ctor_flags);
+
+		/* The constructor must not have touched the red zones. */
+		if (cachep->flags & SLAB_RED_ZONE) {
+			if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
+				slab_error(cachep, "constructor overwrote the"
+							" end of an object");
+			if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
+				slab_error(cachep, "constructor overwrote the"
+							" start of an object");
+		}
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) &&
+				(cachep->flags & SLAB_POISON))
+			kernel_map_pages(virt_to_page(obj), cachep->objsize/PAGE_SIZE, 0);
+#else
+		if (cachep->ctor)
+			cachep->ctor(obj, cachep, ctor_flags);
+#endif
+		/* Free-list chain: every object links to its successor. */
+		bufctl[nr] = nr+1;
+	}
+	/* The last object ends the chain, the first one heads it. */
+	bufctl[nr-1] = BUFCTL_END;
+	slabp->free = 0;
+}
+
+static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags)
+{
+	const int want_dma = (flags & SLAB_DMA) != 0;
+	const int cache_dma = (cachep->gfpflags & GFP_DMA) != 0;
+
+	/* A DMA request needs a DMA cache, and a DMA cache a DMA request. */
+	if (want_dma != cache_dma)
+		BUG();
+}
+
+static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
+{
+	struct page *pg = virt_to_page(objp);
+	int left = 1 << cachep->gfporder;
+
+	/* Record the owning cache and slab in every page of the slab. */
+	for (; left > 0; left--, pg++) {
+		SET_PAGE_CACHE(pg, cachep);
+		SET_PAGE_SLAB(pg, slabp);
+	}
+}
+
+/*
+ * Grow (by 1) the number of slabs within a cache.  This is called by
+ * kmem_cache_alloc() when there are no active objs left in a cache.
+ *
+ * @cachep: cache to grow.
+ * @flags: allocation flags; anything outside
+ *         SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW ends in BUG().
+ * @nodeid: passed through to kmem_getpages().
+ *
+ * Returns 1 if a new slab was added to the free list, 0 if SLAB_NO_GROW
+ * was given or an allocation failed.
+ *
+ * Must be entered with local interrupts disabled.  If @flags allow
+ * waiting (__GFP_WAIT), interrupts are enabled while the pages and the
+ * slab descriptor are allocated and disabled again before returning.
+ */
+static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+{
+	struct slab	*slabp;
+	void		*objp;
+	size_t		 offset;
+	unsigned int	 local_flags;
+	unsigned long	 ctor_flags;
+
+	/* Be lazy and only check for valid flags here,
+	 * keeping it out of the critical path in kmem_cache_alloc().
+	 */
+	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
+		BUG();
+	if (flags & SLAB_NO_GROW)
+		return 0;
+
+	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
+	local_flags = (flags & SLAB_LEVEL_MASK);
+	if (!(local_flags & __GFP_WAIT))
+		/*
+		 * Not allowed to sleep.  Need to tell a constructor about
+		 * this - it might need to know...
+		 */
+		ctor_flags |= SLAB_CTOR_ATOMIC;
+
+	/* About to mess with non-constant members - lock. */
+	check_irq_off();
+	spin_lock(&cachep->spinlock);
+
+	/* Get colour for the slab, and cal the next value. */
+	offset = cachep->colour_next;
+	cachep->colour_next++;
+	if (cachep->colour_next >= cachep->colour)
+		cachep->colour_next = 0;
+	offset *= cachep->colour_off;	/* colour index -> byte offset */
+
+	spin_unlock(&cachep->spinlock);
+
+	if (local_flags & __GFP_WAIT)
+		local_irq_enable();
+
+	/*
+	 * The flag sanity check (kmem_flagcheck(): a DMA request must match
+	 * a DMA cache) is performed here, rather than the more obvious
+	 * place, simply to reduce the critical path length
+	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
+	 * will eventually be caught here (where it matters).
+	 */
+	kmem_flagcheck(cachep, flags);
+
+
+	/* Get mem for the objs. */
+	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
+		goto failed;
+
+	/* Get slab management. */
+	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
+		goto opps1;
+
+	set_slab_attr(cachep, slabp, objp);
+
+	cache_init_objs(cachep, slabp, ctor_flags);
+
+	if (local_flags & __GFP_WAIT)
+		local_irq_disable();
+	check_irq_off();
+	spin_lock(&cachep->spinlock);
+
+	/* Make slab active. */
+	list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
+	STATS_INC_GROWN(cachep);
+	list3_data(cachep)->free_objects += cachep->num;
+	spin_unlock(&cachep->spinlock);
+	return 1;
+opps1:
+	kmem_freepages(cachep, objp);	/* no descriptor: give the pages back */
+failed:
+	if (local_flags & __GFP_WAIT)
+		local_irq_disable();	/* restore the irq state we were entered with */
+	return 0;
+}
+
+#if DEBUG
+
+/*
+ * Perform extra freeing checks:
+ * - detect bad pointers.
+ * - POISON/RED_ZONE checking
+ * - destructor calls, for caches with POISON+dtor
+ */
+static void kfree_debugcheck(const void *objp)
+{
+	const unsigned long addr = (unsigned long)objp;
+
+	/* The pointer has to pass virt_addr_valid() ... */
+	if (!virt_addr_valid(objp)) {
+		printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+			addr);
+		BUG();
+	}
+	/* ... and has to lie in a page that is flagged as a slab page. */
+	if (!PageSlab(virt_to_page(objp))) {
+		printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", addr);
+		BUG();
+	}
+}
+
+static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
+					void *caller)
+{
+	kmem_cache_t *page_cache;
+	struct slab *slabp;
+	struct page *pg;
+	unsigned int idx;
+
+	/* Step back from the caller's pointer to the start of the debug head. */
+	objp -= obj_dbghead(cachep);
+	kfree_debugcheck(objp);
+	pg = virt_to_page(objp);
+
+	/* The page should belong to the cache the object is freed to. */
+	page_cache = GET_PAGE_CACHE(pg);
+	if (page_cache != cachep) {
+		printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+				page_cache, cachep);
+		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
+		printk(KERN_ERR "%p is %s.\n", page_cache, page_cache->name);
+		WARN_ON(1);
+	}
+	slabp = GET_PAGE_SLAB(pg);
+
+	if (cachep->flags & SLAB_RED_ZONE) {
+		/* Both zones must read active; afterwards mark them inactive. */
+		if (!(*dbg_redzone1(cachep, objp) == RED_ACTIVE &&
+				*dbg_redzone2(cachep, objp) == RED_ACTIVE)) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
+			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+					objp, *dbg_redzone1(cachep, objp),
+					*dbg_redzone2(cachep, objp));
+		}
+		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
+		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
+	}
+	if (cachep->flags & SLAB_STORE_USER)
+		*dbg_userword(cachep, objp) = caller;
+
+	/* The pointer must be the exact start of one of the slab's objects. */
+	idx = (objp-slabp->s_mem)/cachep->objsize;
+
+	BUG_ON(idx >= cachep->num);
+	BUG_ON(objp != slabp->s_mem + idx*cachep->objsize);
+
+	if (cachep->flags & SLAB_DEBUG_INITIAL) {
+		/*
+		 * Let the constructor verify the object's state (debugging).
+		 * It is called without the cache-lock held.
+		 */
+		cachep->ctor(objp+obj_dbghead(cachep),
+					cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
+	}
+	if (cachep->flags & SLAB_POISON) {
+		/* The object is about to be poisoned: destruct it first. */
+		if (cachep->dtor)
+			cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+			store_stackinfo(cachep, objp, (unsigned long)caller);
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+		} else {
+			poison_obj(cachep, objp, POISON_FREE);
+		}
+#else
+		poison_obj(cachep, objp, POISON_FREE);
+#endif
+	}
+	return objp;
+}
+
+/*
+ * Debug check of a slab's free list.  Walks the bufctl chain starting at
+ * slabp->free and verifies that every index is below cachep->num and that
+ * the chain holds exactly cachep->num - slabp->inuse entries.  On any
+ * inconsistency the slab descriptor and its bufctl array are hexdumped
+ * and BUG() is raised.  The cache spinlock must be held by the caller.
+ */
+static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
+{
+	kmem_bufctl_t i;
+	int entries = 0;
+	
+	check_spinlock_acquired(cachep);
+	/* Check slab's freelist to see if this obj is there. */
+	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
+		entries++;
+		if (entries > cachep->num || i >= cachep->num)
+			goto bad;
+	}
+	if (entries != cachep->num - slabp->inuse) {
+bad:
+		printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+				cachep->name, cachep->num, slabp, slabp->inuse);
+		/*
+		 * Dump the whole descriptor plus the bufctl array behind it:
+		 * the length is based on sizeof(*slabp), the descriptor, not
+		 * on the size of the pointer to it.
+		 */
+		for (i=0;i<sizeof(*slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
+			if ((i%16)==0)
+				printk("\n%03x:", i);
+			printk(" %02x", ((unsigned char*)slabp)[i]);
+		}
+		printk("\n");
+		BUG();
+	}
+}
+#else
+/*
+ * Stub versions of the debug hooks for the other branch of the enclosing
+ * preprocessor conditional (presumably the non-DEBUG build - confirm
+ * against the matching #if).  The checks expand to nothing and
+ * cache_free_debugcheck() evaluates to the object pointer unchanged.
+ */
+#define kfree_debugcheck(x) do { } while(0)
+#define cache_free_debugcheck(x,objp,z) (objp)
+#define check_slabp(x,y) do { } while(0)
+#endif
+
+/*
+ * cache_alloc_refill - refill this cpu's array cache and return one object.
+ * @cachep: cache to allocate from
+ * @flags: gfp flags, handed to cache_grow() when a new slab is required
+ *
+ * Asserts via check_irq_off() that local interrupts are disabled.
+ * Objects are taken, in order of preference, from the shared array, from
+ * slabs on the partial list and from slabs on the free list.  If nothing
+ * could be obtained the cache is grown and the refill is retried.
+ *
+ * Returns an object pointer, or NULL when cache_grow() returned 0 and the
+ * array cache is still empty.
+ */
+static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
+{
+	int batchcount;
+	struct kmem_list3 *l3;
+	struct array_cache *ac;
+
+	check_irq_off();
+	ac = ac_data(cachep);
+retry:
+	batchcount = ac->batchcount;
+	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
+		/* if there was little recent activity on this
+		 * cache, then perform only a partial refill.
+		 * Otherwise we could generate refill bouncing.
+		 */
+		batchcount = BATCHREFILL_LIMIT;
+	}
+	l3 = list3_data(cachep);
+
+	/* Refill is only entered with an empty array cache. */
+	BUG_ON(ac->avail > 0);
+	spin_lock(&cachep->spinlock);
+	if (l3->shared) {
+		struct array_cache *shared_array = l3->shared;
+		if (shared_array->avail) {
+			/* Take up to batchcount pointers from the top of the shared array. */
+			if (batchcount > shared_array->avail)
+				batchcount = shared_array->avail;
+			shared_array->avail -= batchcount;
+			ac->avail = batchcount;
+			memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
+					sizeof(void*)*batchcount);
+			shared_array->touched = 1;
+			/* free_objects is not adjusted on this path. */
+			goto alloc_done;
+		}
+	}
+	while (batchcount > 0) {
+		struct list_head *entry;
+		struct slab *slabp;
+		/* Get slab alloc is to come from. */
+		entry = l3->slabs_partial.next;
+		if (entry == &l3->slabs_partial) {
+			/* No partial slab: fall back to a completely free one. */
+			l3->free_touched = 1;
+			entry = l3->slabs_free.next;
+			if (entry == &l3->slabs_free)
+				goto must_grow;
+		}
+
+		slabp = list_entry(entry, struct slab, list);
+		check_slabp(cachep, slabp);
+		check_spinlock_acquired(cachep);
+		/* Pop objects off this slab's free chain until it is full or the batch is done. */
+		while (slabp->inuse < cachep->num && batchcount--) {
+			kmem_bufctl_t next;
+			STATS_INC_ALLOCED(cachep);
+			STATS_INC_ACTIVE(cachep);
+			STATS_SET_HIGH(cachep);
+
+			/* get obj pointer */
+			ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
+
+			slabp->inuse++;
+			next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+			/* Mark the entry as handed out; free_block() checks for this marker. */
+			slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+		       	slabp->free = next;
+		}
+		check_slabp(cachep, slabp);
+
+		/* move slabp to correct slabp list: */
+		list_del(&slabp->list);
+		if (slabp->free == BUFCTL_END)
+			list_add(&slabp->list, &l3->slabs_full);
+		else
+			list_add(&slabp->list, &l3->slabs_partial);
+	}
+
+must_grow:
+	/* Account for everything that was moved from slabs into the array cache. */
+	l3->free_objects -= ac->avail;
+alloc_done:
+	spin_unlock(&cachep->spinlock);
+
+	if (unlikely(!ac->avail)) {
+		int x;
+		x = cache_grow(cachep, flags, -1);
+		
+		// cache_grow can reenable interrupts, then ac could change.
+		ac = ac_data(cachep);
+		if (!x && ac->avail == 0)	// no objects in sight? abort
+			return NULL;
+
+		if (!ac->avail)		// objects refilled by interrupt?
+			goto retry;
+	}
+	ac->touched = 1;
+	return ac_entry(ac)[--ac->avail];
+}
+
+/*
+ * cache_alloc_debugcheck_before - checks run before an allocation.
+ * @cachep: cache about to be allocated from
+ * @flags: gfp flags of the allocation
+ *
+ * Invokes might_sleep_if() when __GFP_WAIT is set in @flags and, in DEBUG
+ * builds only, passes the cache and flags to kmem_flagcheck().
+ */
+static inline void
+cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
+{
+	might_sleep_if(flags & __GFP_WAIT);
+#if DEBUG
+	kmem_flagcheck(cachep, flags);
+#endif
+}
+
+#if DEBUG
+/*
+ * cache_alloc_debugcheck_after - debug processing of a freshly allocated object.
+ * @cachep: cache the object came from
+ * @flags: gfp flags of the allocation (only __GFP_WAIT is examined here)
+ * @objp: raw object pointer as stored in the slab, may be NULL
+ * @caller: address recorded as the object's user when SLAB_STORE_USER is set
+ *
+ * Depending on the cache's debug flags this verifies and rewrites the
+ * poison pattern, records the caller, validates and arms both red zones,
+ * and re-runs the constructor for poisoned caches.
+ *
+ * Returns @objp advanced by obj_dbghead(cachep), or NULL if @objp was NULL.
+ */
+static void *
+cache_alloc_debugcheck_after(kmem_cache_t *cachep,
+			unsigned long flags, void *objp, void *caller)
+{
+	if (!objp)	
+		return objp;
+ 	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		/* Page-multiple, off-slab objects were unmapped on free: map them again. */
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+		else
+			check_poison_obj(cachep, objp);
+#else
+		check_poison_obj(cachep, objp);
+#endif
+		poison_obj(cachep, objp, POISON_INUSE);
+	}
+	if (cachep->flags & SLAB_STORE_USER)
+		*dbg_userword(cachep, objp) = caller;
+
+	if (cachep->flags & SLAB_RED_ZONE) {
+		/* Both red zones of an unallocated object must read RED_INACTIVE. */
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
+			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+					objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
+		}
+		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
+		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
+	}
+	/* From here on objp is the pointer that is handed to the caller. */
+	objp += obj_dbghead(cachep);
+	if (cachep->ctor && cachep->flags & SLAB_POISON) {
+		/* The object was overwritten by POISON_INUSE above, so construct it again. */
+		unsigned long	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
+
+		if (!(flags & __GFP_WAIT))
+			ctor_flags |= SLAB_CTOR_ATOMIC;
+
+		cachep->ctor(objp, cachep, ctor_flags);
+	}	
+	return objp;
+}
+#else
+/* Without DEBUG the hook evaluates to the object pointer unchanged. */
+#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#endif
+
+
+/*
+ * __cache_alloc - common allocation path.
+ * @cachep: cache to allocate from
+ * @flags: gfp flags, only consulted by the debug hooks and the refill path
+ *
+ * With local interrupts disabled, pops an object from this cpu's array
+ * cache; when the array is empty it falls back to cache_alloc_refill().
+ * The result is passed through the post-allocation debug hook.
+ */
+static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+{
+	unsigned long irq_state;
+	struct array_cache *cpu_cache;
+	void *obj;
+
+	cache_alloc_debugcheck_before(cachep, flags);
+
+	local_irq_save(irq_state);
+	cpu_cache = ac_data(cachep);
+	if (unlikely(!cpu_cache->avail)) {
+		/* slow path: array cache is empty */
+		STATS_INC_ALLOCMISS(cachep);
+		obj = cache_alloc_refill(cachep, flags);
+	} else {
+		/* fast path: take the most recently freed object */
+		STATS_INC_ALLOCHIT(cachep);
+		cpu_cache->touched = 1;
+		obj = ac_entry(cpu_cache)[--cpu_cache->avail];
+	}
+	local_irq_restore(irq_state);
+
+	obj = cache_alloc_debugcheck_after(cachep, flags, obj, __builtin_return_address(0));
+	return obj;
+}
+
+/*
+ * free_block - return a batch of objects to their slabs.
+ * @cachep: cache the objects belong to; its spinlock must be held
+ * @objpp: array of object pointers
+ * @nr_objects: number of entries in @objpp
+ *
+ * Each object is pushed back onto the free chain of its slab.  A slab
+ * that becomes completely unused is either destroyed (when the cache
+ * already holds more than free_limit free objects) or put on the free
+ * list; any other slab is moved to the tail of the partial list.
+ *
+ * NUMA: different approach needed if the spinlock is moved into
+ * the l3 structure
+ */
+
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
+{
+	int i;
+
+	check_spinlock_acquired(cachep);
+
+	/* NUMA: move add into loop */
+	cachep->lists.free_objects += nr_objects;
+
+	for (i = 0; i < nr_objects; i++) {
+		void *objp = objpp[i];
+		struct slab *slabp;
+		unsigned int objnr;
+
+		/* Look up the slab via the object's page and unlink it; it is re-filed below. */
+		slabp = GET_PAGE_SLAB(virt_to_page(objp));
+		list_del(&slabp->list);
+		objnr = (objp - slabp->s_mem) / cachep->objsize;
+		check_slabp(cachep, slabp);
+#if DEBUG
+		/* An allocated object carries BUFCTL_FREE in its bufctl slot. */
+		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+			printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
+						cachep->name, objp);
+			BUG();
+		}
+#endif
+		/* Push the object onto the head of the slab's free chain. */
+		slab_bufctl(slabp)[objnr] = slabp->free;
+		slabp->free = objnr;
+		STATS_DEC_ACTIVE(cachep);
+		slabp->inuse--;
+		check_slabp(cachep, slabp);
+
+		/* fixup slab chains */
+		if (slabp->inuse == 0) {
+			if (cachep->lists.free_objects > cachep->free_limit) {
+				/* Too many free objects cached: give the whole slab back. */
+				cachep->lists.free_objects -= cachep->num;
+				slab_destroy(cachep, slabp);
+			} else {
+				list_add(&slabp->list,
+				&list3_data_ptr(cachep, objp)->slabs_free);
+			}
+		} else {
+			/* Unconditionally move a slab to the end of the
+			 * partial list on free - maximum time for the
+			 * other objects to be freed, too.
+			 */
+			list_add_tail(&slabp->list,
+				&list3_data_ptr(cachep, objp)->slabs_partial);
+		}
+	}
+}
+
+/*
+ * cache_flusharray - make room in a full per-cpu array cache.
+ * @cachep: owning cache
+ * @ac: the array cache to flush; local interrupts must be disabled
+ *
+ * Moves up to ac->batchcount of the oldest entries either into the
+ * shared array (if one exists and has room) or back to their slabs, then
+ * compacts the remaining entries to the front of @ac.
+ */
+static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
+{
+	int nr_flush = ac->batchcount;
+
+#if DEBUG
+	BUG_ON(!nr_flush || nr_flush > ac->avail);
+#endif
+	check_irq_off();
+	spin_lock(&cachep->spinlock);
+	if (cachep->lists.shared) {
+		struct array_cache *shared = cachep->lists.shared;
+		int room = shared->limit - shared->avail;
+
+		if (room) {
+			/* hand the objects over to the shared array */
+			if (nr_flush > room)
+				nr_flush = room;
+			memcpy(&ac_entry(shared)[shared->avail], &ac_entry(ac)[0],
+					sizeof(void*)*nr_flush);
+			shared->avail += nr_flush;
+			goto free_done;
+		}
+	}
+
+	/* no shared array, or it is full: return the objects to the slabs */
+	free_block(cachep, &ac_entry(ac)[0], nr_flush);
+free_done:
+#if STATS
+	{
+		/* record how many completely free slabs are currently cached */
+		int nr_free_slabs = 0;
+		struct list_head *pos;
+
+		for (pos = list3_data(cachep)->slabs_free.next;
+				pos != &(list3_data(cachep)->slabs_free);
+				pos = pos->next) {
+			struct slab *slabp = list_entry(pos, struct slab, list);
+
+			BUG_ON(slabp->inuse);
+			nr_free_slabs++;
+		}
+		STATS_SET_FREEABLE(cachep, nr_free_slabs);
+	}
+#endif
+	spin_unlock(&cachep->spinlock);
+
+	/* drop the flushed entries and slide the rest down */
+	ac->avail -= nr_flush;
+	memmove(&ac_entry(ac)[0], &ac_entry(ac)[nr_flush],
+			sizeof(void*)*ac->avail);
+}
+
+/*
+ * __cache_free - common free path.
+ * @cachep: cache the object belongs to
+ * @objp: object being released
+ *
+ * Puts the object into this cpu's array cache, flushing part of the
+ * array first if it is already at its limit.  An object with a
+ * constructed state must be in that state _before_ it is released.
+ *
+ * Must be called with interrupts disabled.
+ */
+static inline void __cache_free(kmem_cache_t *cachep, void *objp)
+{
+	struct array_cache *cpu_cache = ac_data(cachep);
+
+	check_irq_off();
+	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+
+	if (unlikely(cpu_cache->avail >= cpu_cache->limit)) {
+		/* array is full: push a batch out before storing */
+		STATS_INC_FREEMISS(cachep);
+		cache_flusharray(cachep, cpu_cache);
+	} else {
+		STATS_INC_FREEHIT(cachep);
+	}
+	ac_entry(cpu_cache)[cpu_cache->avail++] = objp;
+}
+
+/**
+ * kmem_cache_alloc - Allocate an object
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache.  The flags are only relevant
+ * if the cache has no available objects.
+ *
+ * Thin exported wrapper: the work is done by __cache_alloc(), whose
+ * result is returned unchanged.
+ */
+void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+{
+	return __cache_alloc(cachep, flags);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+/**
+ * kmem_ptr_validate - plausibility check for an untrusted pointer
+ * @cachep: the cache the pointer is claimed to belong to
+ * @ptr: pointer to validate
+ *
+ * Performs a series of cheap sanity checks: address range, word
+ * alignment, validity of the first and last byte, and that the backing
+ * page is a slab page of @cachep.  Passing all checks is _not_ a
+ * guarantee that @ptr is an object of the cache, only that it can be
+ * dereferenced and looks half-way sane.
+ *
+ * Currently only used for dentry validation.
+ *
+ * Returns 1 if the pointer looks sane, 0 otherwise.
+ */
+int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
+{
+	const unsigned long word_mask = BYTES_PER_WORD-1;
+	const unsigned long lowest = PAGE_OFFSET;
+	unsigned long obj_size = cachep->objsize;
+	unsigned long start = (unsigned long) ptr;
+	struct page *page;
+
+	/* address range */
+	if (unlikely(start < lowest))
+		return 0;
+	if (unlikely(start > (unsigned long)high_memory - obj_size))
+		return 0;
+
+	/* alignment */
+	if (unlikely(start & word_mask))
+		return 0;
+
+	/* both ends of the object must be valid kernel addresses */
+	if (unlikely(!kern_addr_valid(start)))
+		return 0;
+	if (unlikely(!kern_addr_valid(start + obj_size - 1)))
+		return 0;
+
+	/* the page must be a slab page owned by this cache */
+	page = virt_to_page(ptr);
+	if (unlikely(!PageSlab(page)))
+		return 0;
+	if (unlikely(GET_PAGE_CACHE(page) != cachep))
+		return 0;
+
+	return 1;
+}
+
+#ifdef CONFIG_NUMA
+/**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @nodeid: node number of the target node.
+ *
+ * Identical to kmem_cache_alloc, except that this function is slow
+ * and can sleep. And it will allocate memory on the given node, which
+ * can improve the performance for cpu bound structures.
+ *
+ * There is no gfp flags argument: the cache is always grown with
+ * GFP_KERNEL.  The per-cpu array caches are bypassed; the object is
+ * taken directly from a slab.  Returns NULL if cache_grow() fails.
+ */
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
+{
+	int loop;
+	void *objp;
+	struct slab *slabp;
+	kmem_bufctl_t next;
+
+	for (loop = 0;;loop++) {
+		struct list_head *q;
+
+		objp = NULL;
+		check_irq_on();
+		spin_lock_irq(&cachep->spinlock);
+		/* walk through all partial and empty slab and find one
+		 * from the right node */
+		list_for_each(q,&cachep->lists.slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+
+			/* Once loop exceeds 2, a slab from any node is accepted. */
+			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
+					loop > 2)
+				goto got_slabp;
+		}
+		list_for_each(q, &cachep->lists.slabs_free) {
+			slabp = list_entry(q, struct slab, list);
+
+			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
+					loop > 2)
+				goto got_slabp;
+		}
+		spin_unlock_irq(&cachep->spinlock);
+
+		/* Nothing suitable: grow the cache on the requested node and rescan. */
+		local_irq_disable();
+		if (!cache_grow(cachep, GFP_KERNEL, nodeid)) {
+			local_irq_enable();
+			return NULL;
+		}
+		local_irq_enable();
+	}
+got_slabp:
+	/* found one: allocate object */
+	/* The cache spinlock is still held here (taken inside the loop). */
+	check_slabp(cachep, slabp);
+	check_spinlock_acquired(cachep);
+
+	STATS_INC_ALLOCED(cachep);
+	STATS_INC_ACTIVE(cachep);
+	STATS_SET_HIGH(cachep);
+	STATS_INC_NODEALLOCS(cachep);
+
+	/* Pop the head of the slab's free chain. */
+	objp = slabp->s_mem + slabp->free*cachep->objsize;
+
+	slabp->inuse++;
+	next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+	slabp->free = next;
+	check_slabp(cachep, slabp);
+
+	/* move slabp to correct slabp list: */
+	list_del(&slabp->list);
+	if (slabp->free == BUFCTL_END)
+		list_add(&slabp->list, &cachep->lists.slabs_full);
+	else
+		list_add(&slabp->list, &cachep->lists.slabs_partial);
+
+	list3_data(cachep)->free_objects--;
+	spin_unlock_irq(&cachep->spinlock);
+
+	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
+					__builtin_return_address(0));
+	return objp;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+#endif
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.  This is the out-of-line implementation: it picks the
+ * general cache matching @size and @flags and allocates from it.
+ * Returns NULL if no general cache matches or the allocation fails.
+ *
+ * The @flags argument may be one of:
+ *
+ * %GFP_USER - Allocate memory on behalf of user.  May sleep.
+ *
+ * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
+ *
+ * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
+ *
+ * Additionally, the %GFP_DMA flag may be set to indicate the memory
+ * must be suitable for DMA.  This can mean different things on different
+ * platforms.  For example, on i386, it means that the memory must come
+ * from the first 16MB.
+ */
+void *__kmalloc(size_t size, unsigned int __nocast flags)
+{
+	kmem_cache_t *general_cache = kmem_find_general_cachep(size, flags);
+
+	if (likely(general_cache != NULL))
+		return __cache_alloc(general_cache, flags);
+
+	/* no general cache can hold an object of this size */
+	return NULL;
+}
+EXPORT_SYMBOL(__kmalloc);
+
+#ifdef CONFIG_SMP
+/**
+ * __alloc_percpu - allocate one copy of the object for every present
+ * cpu in the system, zeroing them.
+ * Objects should be dereferenced using the per_cpu_ptr macro only.
+ *
+ * @size: how many bytes of memory are required.
+ * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
+ *	Not used by this implementation.
+ *
+ * Returns the bitwise complement of the address of the bookkeeping
+ * structure (so that a direct dereference faults), or NULL if @size has
+ * no matching general cache or any allocation fails.  On failure every
+ * per-cpu copy allocated so far is released again.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	int i;
+	kmem_cache_t *cachep;
+	struct percpu_data *pdata;
+
+	/*
+	 * Look the general cache up once instead of once per cpu, and
+	 * refuse sizes no general cache can serve: the lookup result is
+	 * NULL-checked in __kmalloc() as well, and kmem_cache_alloc_node()
+	 * dereferences the cache pointer unconditionally.
+	 */
+	cachep = kmem_find_general_cachep(size, GFP_KERNEL);
+	if (unlikely(cachep == NULL))
+		return NULL;
+
+	pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+	if (!pdata)
+		return NULL;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		pdata->ptrs[i] = kmem_cache_alloc_node(cachep, cpu_to_node(i));
+
+		if (!pdata->ptrs[i])
+			goto unwind_oom;
+		memset(pdata->ptrs[i], 0, size);
+	}
+
+	/* Catch derefs w/o wrappers */
+	return (void *) (~(unsigned long) pdata);
+
+unwind_oom:
+	/* Free the copies of the cpus handled before the failing one. */
+	while (--i >= 0) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(pdata->ptrs[i]);
+	}
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL(__alloc_percpu);
+#endif
+
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Hands an object that was obtained from @cachep back to it.  The
+ * actual work is done by __cache_free() with local interrupts disabled.
+ */
+void kmem_cache_free(kmem_cache_t *cachep, void *objp)
+{
+	unsigned long irq_state;
+
+	local_irq_save(irq_state);
+	__cache_free(cachep, objp);
+	local_irq_restore(irq_state);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+/**
+ * kcalloc - allocate zeroed memory for an array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate.
+ *
+ * Returns NULL when @n * @size would exceed INT_MAX or when the
+ * underlying kmalloc() fails.
+ */
+void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+{
+	size_t total;
+	void *mem;
+
+	/* reject requests whose byte count would exceed INT_MAX */
+	if (n != 0 && size > INT_MAX / n)
+		return NULL;
+
+	total = n * size;
+	mem = kmalloc(total, flags);
+	if (mem != NULL)
+		memset(mem, 0, total);
+	return mem;
+}
+EXPORT_SYMBOL(kcalloc);
+
+/**
+ * kfree - free previously allocated memory
+ * @objp: pointer returned by kmalloc.
+ *
+ * A NULL @objp is ignored.  The owning cache is looked up through the
+ * object's page, so only pass pointers that really came from kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *objp)
+{
+	kmem_cache_t *owner;
+	unsigned long irq_state;
+
+	if (unlikely(!objp))
+		return;
+
+	local_irq_save(irq_state);
+	kfree_debugcheck(objp);
+	owner = GET_PAGE_CACHE(virt_to_page(objp));
+	__cache_free(owner, (void*)objp);
+	local_irq_restore(irq_state);
+}
+EXPORT_SYMBOL(kfree);
+
+#ifdef CONFIG_SMP
+/**
+ * free_percpu - free previously allocated percpu memory
+ * @objp: pointer returned by alloc_percpu.
+ *
+ * @objp is the complemented address of the bookkeeping structure; it
+ * is complemented back here before use.  Don't pass memory that was not
+ * originally allocated by alloc_percpu().
+ */
+void
+free_percpu(const void *objp)
+{
+	struct percpu_data *pdata = (struct percpu_data *) (~(unsigned long) objp);
+	int cpu;
+
+	/* release the copy of every possible cpu, then the bookkeeping */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (cpu_possible(cpu))
+			kfree(pdata->ptrs[cpu]);
+	}
+	kfree(pdata);
+}
+EXPORT_SYMBOL(free_percpu);
+#endif
+
+/*
+ * kmem_cache_size - report the object size of a cache.
+ * @cachep: cache to query
+ *
+ * Returns the value of obj_reallen() for @cachep.
+ */
+unsigned int kmem_cache_size(kmem_cache_t *cachep)
+{
+	return obj_reallen(cachep);
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+/*
+ * Argument block for do_ccupdate_local(): the cache being retuned and,
+ * per cpu, the array cache to install.  After the callback ran on a cpu,
+ * that cpu's slot holds the array cache that was replaced.
+ */
+struct ccupdate_struct {
+	kmem_cache_t *cachep;
+	struct array_cache *new[NR_CPUS];
+};
+
+/*
+ * do_ccupdate_local - swap in a new array cache on the executing cpu.
+ * @info: pointer to a struct ccupdate_struct
+ *
+ * Installs the replacement array cache for this cpu and stores the
+ * previous one in the same slot of the update structure, so that the
+ * initiator can drain and free it.  Expects interrupts to be disabled.
+ */
+static void do_ccupdate_local(void *info)
+{
+	struct ccupdate_struct *update = info;
+	struct array_cache *prev;
+	int cpu;
+
+	check_irq_off();
+	prev = ac_data(update->cachep);
+
+	cpu = smp_processor_id();
+	update->cachep->array[cpu] = update->new[cpu];
+	update->new[cpu] = prev;
+}
+
+
+/*
+ * do_tune_cpucache - replace the per-cpu and shared array caches of a cache.
+ * @cachep: cache to retune
+ * @limit: capacity of each new per-cpu array cache
+ * @batchcount: batchcount of each new per-cpu array cache
+ * @shared: the new shared array gets a capacity of @batchcount * @shared
+ *
+ * Allocates a new array cache for every online cpu, swaps them in on all
+ * cpus, updates the cache's tunables, and returns the objects held by the
+ * old arrays to their slabs.  A failure to allocate the new shared array
+ * is not reported; the old shared array then stays in place.
+ *
+ * Returns 0 on success or -ENOMEM if a per-cpu array cannot be allocated.
+ */
+static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
+				int shared)
+{
+	struct ccupdate_struct new;
+	struct array_cache *new_shared;
+	int i;
+
+	memset(&new.new,0,sizeof(new.new));
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_online(i)) {
+			new.new[i] = alloc_arraycache(i, limit, batchcount);
+			if (!new.new[i]) {
+				/* Undo: free what was allocated for the lower-numbered cpus. */
+				for (i--; i >= 0; i--) kfree(new.new[i]);
+				return -ENOMEM;
+			}
+		} else {
+			new.new[i] = NULL;
+		}
+	}
+	new.cachep = cachep;
+
+	/* After this call new.new[] holds the replaced (old) array caches. */
+	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+	
+	check_irq_on();
+	spin_lock_irq(&cachep->spinlock);
+	cachep->batchcount = batchcount;
+	cachep->limit = limit;
+	cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
+	spin_unlock_irq(&cachep->spinlock);
+
+	/* Drain each old array cache back into the slabs, then free it. */
+	for (i = 0; i < NR_CPUS; i++) {
+		struct array_cache *ccold = new.new[i];
+		if (!ccold)
+			continue;
+		spin_lock_irq(&cachep->spinlock);
+		free_block(cachep, ac_entry(ccold), ccold->avail);
+		spin_unlock_irq(&cachep->spinlock);
+		kfree(ccold);
+	}
+	/* NOTE(review): 0xbaadf00d looks like a marker for a batchcount that is never used by the shared array - confirm. */
+	new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
+	if (new_shared) {
+		struct array_cache *old;
+
+		spin_lock_irq(&cachep->spinlock);
+		old = cachep->lists.shared;
+		cachep->lists.shared = new_shared;
+		if (old)
+			free_block(cachep, ac_entry(old), old->avail);
+		spin_unlock_irq(&cachep->spinlock);
+		kfree(old);
+	}
+
+	return 0;
+}
+
+
+/*
+ * enable_cpucache - pick default tunables for a cache and apply them.
+ * @cachep: cache to configure
+ *
+ * Derives the per-cpu array limit and the shared-array factor from the
+ * object size and calls do_tune_cpucache().  A failure is only logged.
+ */
+static void enable_cpucache(kmem_cache_t *cachep)
+{
+	int limit;
+	int shared = 0;
+	int ret;
+
+	/* The head array serves three purposes:
+	 * - it gives LIFO ordering, so cache-warm objects are handed out first
+	 * - it cuts down the number of spinlock operations
+	 * - it cuts down the number of linked list operations on the slab
+	 *   and bufctl chains: array operations are cheaper.
+	 * The numbers below are guesses; auto-tuning as described by
+	 * Bonwick would be preferable.
+	 */
+	if (cachep->objsize > 131072)
+		limit = 1;
+	else if (cachep->objsize > PAGE_SIZE)
+		limit = 8;
+	else if (cachep->objsize > 1024)
+		limit = 24;
+	else if (cachep->objsize > 256)
+		limit = 54;
+	else
+		limit = 120;
+
+	/* Cpu bound tasks (e.g. network routing) can allocate mostly on one
+	 * cpu and free mostly on another.  Passing objects between cpus
+	 * efficiently is the job of the shared array, which takes the place
+	 * of Bonwick's magazine layer.
+	 * On uniprocessor it is functionally equivalent to (but less
+	 * efficient than) a larger limit, hence it stays disabled there.
+	 */
+#ifdef CONFIG_SMP
+	if (cachep->objsize <= PAGE_SIZE)
+		shared = 8;
+#endif
+
+#if DEBUG
+	/* With debugging enabled, large batches keep local interrupts
+	 * disabled for excessively long periods, so cap the limit (and with
+	 * it the batchcount derived below).
+	 */
+	if (limit > 32)
+		limit = 32;
+#endif
+	ret = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
+	if (ret)
+		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+					cachep->name, -ret);
+}
+
+/*
+ * drain_array_locked - return part (or all) of an array cache to the slabs.
+ * @cachep: owning cache; its spinlock must be held
+ * @ac: array cache to drain
+ * @force: non-zero to drain everything regardless of recent use
+ *
+ * Without @force a recently touched array only has its touched flag
+ * cleared; otherwise about a fifth of the limit is freed (half of the
+ * available entries, rounded up, if fewer than that are present).
+ */
+static void drain_array_locked(kmem_cache_t *cachep,
+				struct array_cache *ac, int force)
+{
+	int nr_free;
+
+	check_spinlock_acquired(cachep);
+
+	if (!force && ac->touched) {
+		/* recently used: give it another round */
+		ac->touched = 0;
+		return;
+	}
+	if (!ac->avail)
+		return;
+
+	if (force)
+		nr_free = ac->avail;
+	else
+		nr_free = (ac->limit+4)/5;
+	if (nr_free > ac->avail)
+		nr_free = (ac->avail+1)/2;
+
+	/* free the oldest entries and move the remainder to the front */
+	free_block(cachep, ac_entry(ac), nr_free);
+	ac->avail -= nr_free;
+	memmove(&ac_entry(ac)[0], &ac_entry(ac)[nr_free],
+				sizeof(void*)*ac->avail);
+}
+
+/**
+ * cache_reap - Reclaim memory from caches.
+ * @unused: work callback argument, not used
+ *
+ * Called from workqueue/eventd every few seconds.
+ * Purpose:
+ * - clear the per-cpu caches for this CPU.
+ * - return freeable pages to the main free memory pool.
+ *
+ * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * try again on the next iteration.
+ *
+ * The function re-arms itself through schedule_delayed_work() on both
+ * exit paths.
+ */
+static void cache_reap(void *unused)
+{
+	struct list_head *walk;
+
+	if (down_trylock(&cache_chain_sem)) {
+		/* Give up. Setup the next iteration. */
+		schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+		return;
+	}
+
+	list_for_each(walk, &cache_chain) {
+		kmem_cache_t *searchp;
+		struct list_head* p;
+		int tofree;
+		struct slab *slabp;
+
+		searchp = list_entry(walk, kmem_cache_t, next);
+
+		if (searchp->flags & SLAB_NO_REAP)
+			goto next;
+
+		check_irq_on();
+
+		spin_lock_irq(&searchp->spinlock);
+
+		/* Trim this cpu's array cache (not forced). */
+		drain_array_locked(searchp, ac_data(searchp), 0);
+
+		/* The list-level work below is rate limited by next_reap. */
+		if(time_after(searchp->lists.next_reap, jiffies))
+			goto next_unlock;
+
+		searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+
+		if (searchp->lists.shared)
+			drain_array_locked(searchp, searchp->lists.shared, 0);
+
+		/* Free slabs were used since the last pass: leave them alone this time. */
+		if (searchp->lists.free_touched) {
+			searchp->lists.free_touched = 0;
+			goto next_unlock;
+		}
+
+		/* Number of free slabs to destroy: free_limit / (5 * num), rounded up. */
+		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+		do {
+			p = list3_data(searchp)->slabs_free.next;
+			if (p == &(list3_data(searchp)->slabs_free))
+				break;
+
+			slabp = list_entry(p, struct slab, list);
+			BUG_ON(slabp->inuse);
+			list_del(&slabp->list);
+			STATS_INC_REAPED(searchp);
+
+			/* Safe to drop the lock. The slab is no longer
+			 * linked to the cache.
+			 * searchp cannot disappear, we hold
+			 * cache_chain_sem
+			 */
+			searchp->lists.free_objects -= searchp->num;
+			spin_unlock_irq(&searchp->spinlock);
+			slab_destroy(searchp, slabp);
+			spin_lock_irq(&searchp->spinlock);
+		} while(--tofree > 0);
+next_unlock:
+		spin_unlock_irq(&searchp->spinlock);
+next:
+		cond_resched();
+	}
+	check_irq_on();
+	up(&cache_chain_sem);
+	/* Setup the next iteration */
+	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * s_start - seq_file start callback for /proc/slabinfo.
+ * @m: seq_file being filled
+ * @pos: index of the first cache to show
+ *
+ * Takes cache_chain_sem (released in s_stop()), prints the header when
+ * starting at position 0, and returns the cache found after stepping
+ * *pos entries along the cache chain, or NULL if the chain ends first.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t skip = *pos;
+	struct list_head *entry;
+
+	down(&cache_chain_sem);
+	if (skip == 0) {
+		/*
+		 * The format carries a version number so that it can be
+		 * changed without _too_ many complaints.
+		 */
+#if STATS
+		seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+		seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+		seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
+		seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+		seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#if STATS
+		seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
+				" <error> <maxfreeable> <freelimit> <nodeallocs>");
+		seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif
+		seq_putc(m, '\n');
+	}
+
+	/* advance to the requested position in the chain */
+	for (entry = cache_chain.next; skip--; ) {
+		entry = entry->next;
+		if (entry == &cache_chain)
+			return NULL;
+	}
+	return list_entry(entry, kmem_cache_t, next);
+}
+
+/*
+ * s_next - seq_file next callback: step to the following cache.
+ * @m: seq_file (unused)
+ * @p: the cache that was just shown
+ * @pos: position counter, incremented by one
+ *
+ * Returns the next cache on the chain, or NULL at the end of the chain.
+ */
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	kmem_cache_t *current_cache = p;
+	struct list_head *following;
+
+	++*pos;
+	following = current_cache->next.next;
+	if (following == &cache_chain)
+		return NULL;
+	return list_entry(following, kmem_cache_t, next);
+}
+
+/*
+ * s_stop - seq_file stop callback: release cache_chain_sem, which
+ * s_start() acquired.  Both arguments are unused.
+ */
+static void s_stop(struct seq_file *m, void *p)
+{
+	up(&cache_chain_sem);
+}
+
+/*
+ * s_show - seq_file show callback: emit one /proc/slabinfo line.
+ * @m: seq_file to print into
+ * @p: the cache to describe
+ *
+ * Under the cache spinlock, walks the full, partial and free slab lists
+ * to count objects and slabs, cross-checks the counts against the
+ * cache's bookkeeping (the first inconsistency found is logged via
+ * printk), and prints the statistics line.  Always returns 0.
+ */
+static int s_show(struct seq_file *m, void *p)
+{
+	kmem_cache_t *cachep = p;
+	struct list_head *q;
+	struct slab	*slabp;
+	struct array_cache *shared;
+	unsigned long	active_objs;
+	unsigned long	num_objs;
+	unsigned long	active_slabs = 0;
+	unsigned long	num_slabs;
+	unsigned int	shared_factor = 0;
+	unsigned int	shared_avail = 0;
+	const char *name;
+	char *error = NULL;
+
+	check_irq_on();
+	spin_lock_irq(&cachep->spinlock);
+	active_objs = 0;
+	num_slabs = 0;
+	list_for_each(q,&cachep->lists.slabs_full) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse != cachep->num && !error)
+			error = "slabs_full accounting error";
+		active_objs += cachep->num;
+		active_slabs++;
+	}
+	list_for_each(q,&cachep->lists.slabs_partial) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse == cachep->num && !error)
+			error = "slabs_partial inuse accounting error";
+		if (!slabp->inuse && !error)
+			error = "slabs_partial/inuse accounting error";
+		active_objs += slabp->inuse;
+		active_slabs++;
+	}
+	list_for_each(q,&cachep->lists.slabs_free) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse && !error)
+			error = "slabs_free/inuse accounting error";
+		num_slabs++;
+	}
+	num_slabs+=active_slabs;
+	num_objs = num_slabs*cachep->num;
+	if (num_objs - active_objs != cachep->lists.free_objects && !error)
+		error = "free_objects accounting error";
+
+	/*
+	 * The shared array is optional (every other user of lists.shared
+	 * checks it for NULL), so report zeroes when it is absent instead
+	 * of dereferencing a NULL pointer.  Also avoid dividing by a
+	 * batchcount of zero.
+	 */
+	shared = cachep->lists.shared;
+	if (shared) {
+		shared_avail = shared->avail;
+		if (cachep->batchcount)
+			shared_factor = shared->limit/cachep->batchcount;
+	}
+
+	name = cachep->name;
+	if (error)
+		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+
+	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+		name, active_objs, num_objs, cachep->objsize,
+		cachep->num, (1<<cachep->gfporder));
+	seq_printf(m, " : tunables %4u %4u %4u",
+			cachep->limit, cachep->batchcount,
+			shared_factor);
+	seq_printf(m, " : slabdata %6lu %6lu %6u",
+			active_slabs, num_slabs, shared_avail);
+#if STATS
+	{	/* list3 stats */
+		unsigned long high = cachep->high_mark;
+		unsigned long allocs = cachep->num_allocations;
+		unsigned long grown = cachep->grown;
+		unsigned long reaped = cachep->reaped;
+		unsigned long errors = cachep->errors;
+		unsigned long max_freeable = cachep->max_freeable;
+		unsigned long free_limit = cachep->free_limit;
+		unsigned long node_allocs = cachep->node_allocs;
+
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
+				allocs, high, grown, reaped, errors,
+				max_freeable, free_limit, node_allocs);
+	}
+	/* cpu stats */
+	{
+		unsigned long allochit = atomic_read(&cachep->allochit);
+		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
+		unsigned long freehit = atomic_read(&cachep->freehit);
+		unsigned long freemiss = atomic_read(&cachep->freemiss);
+
+		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
+			allochit, allocmiss, freehit, freemiss);
+	}
+#endif
+	seq_putc(m, '\n');
+	spin_unlock_irq(&cachep->spinlock);
+	return 0;
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout, one line per cache (as announced by the header that
+ * s_start() prints):
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * objects-per-slab
+ * pages-per-slab
+ * tunables: limit, batchcount, shared factor
+ * slabdata: num-active-slabs, total-slabs, shared-avail
+ * + globalstat and cpustat values with statistics enabled
+ */
+
+struct seq_operations slabinfo_op = {
+	.start	= s_start,
+	.next	= s_next,
+	.stop	= s_stop,
+	.show	= s_show,
+};
+
+#define MAX_SLABINFO_WRITE 128
+/**
+ * slabinfo_write - Tuning for the slab allocator
+ * @file: unused
+ * @buffer: user buffer
+ * @count: data length
+ * @ppos: unused
+ *
+ * Expects "<cache-name> <limit> <batchcount> <shared>" and applies the
+ * three tunables to the named cache through do_tune_cpucache().
+ *
+ * Returns @count on success, -EINVAL for oversized or malformed input,
+ * an unknown cache name or out-of-range values, -EFAULT if the user
+ * buffer cannot be read, or the error from do_tune_cpucache().
+ */
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
+	int limit, batchcount, shared, res;
+	struct list_head *p;
+
+	if (count > MAX_SLABINFO_WRITE)
+		return -EINVAL;
+	if (copy_from_user(&kbuf, buffer, count))
+		return -EFAULT;
+	/*
+	 * Terminate right after the copied data.  Terminating only at the
+	 * end of the buffer would let strchr()/sscanf() below run into the
+	 * uninitialised stack bytes that follow a short write.
+	 */
+	kbuf[count] = '\0';
+
+	/* Split "<name> <numbers>" at the first blank. */
+	tmp = strchr(kbuf, ' ');
+	if (!tmp)
+		return -EINVAL;
+	*tmp = '\0';
+	tmp++;
+	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
+		return -EINVAL;
+
+	/* Find the cache in the chain of caches. */
+	down(&cache_chain_sem);
+	res = -EINVAL;
+	list_for_each(p,&cache_chain) {
+		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
+
+		if (!strcmp(cachep->name, kbuf)) {
+			if (limit < 1 ||
+			    batchcount < 1 ||
+			    batchcount > limit ||
+			    shared < 0) {
+				res = -EINVAL;
+			} else {
+				res = do_tune_cpucache(cachep, limit, batchcount, shared);
+			}
+			break;
+		}
+	}
+	up(&cache_chain_sem);
+	if (res >= 0)
+		res = count;
+	return res;
+}
+#endif
+
+/*
+ * ksize - report the size of the cache an object was allocated from.
+ * @objp: pointer to an allocated object, may be NULL
+ *
+ * Looks up the owning cache through the object's page and returns its
+ * kmem_cache_size().  Returns 0 for a NULL pointer.
+ */
+unsigned int ksize(const void *objp)
+{
+	unsigned long irq_state;
+	unsigned int size;
+
+	if (unlikely(objp == NULL))
+		return 0;
+
+	local_irq_save(irq_state);
+	size = kmem_cache_size(GET_PAGE_CACHE(virt_to_page(objp)));
+	local_irq_restore(irq_state);
+
+	return size;
+}
diff --git a/mm/swap.c b/mm/swap.c
new file mode 100644
index 000000000000..7771d2803f62
--- /dev/null
+++ b/mm/swap.c
@@ -0,0 +1,485 @@
+/*
+ *  linux/mm/swap.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+/*
+ * This file contains the default values for the operation of the
+ * Linux VM subsystem. Fine-tuning documentation can be found in
+ * Documentation/sysctl/vm.txt.
+ * Started 18.12.91
+ * Swap aging added 23.2.95, Stephen Tweedie.
+ * Buffermem limits added 12.3.98, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm_inline.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page() */
+#include <linux/module.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+
+/*
+ * How many pages do we try to swap or page in/out together?
+ * Initialised by swap_setup() to 2 on machines with less than 16MB of
+ * memory and to 3 otherwise.
+ */
+int page_cluster;
+
+#ifdef CONFIG_HUGETLB_PAGE
+
+/*
+ * Drop a reference on a page.
+ *
+ * For a compound page the reference is dropped on the page stored in
+ * page->private; when that count reaches zero the destructor kept in
+ * page[1].mapping of that page is invoked.  For any other page that is not
+ * PageReserved, the page is handed to __page_cache_release() once its count
+ * reaches zero.
+ */
+void put_page(struct page *page)
+{
+	void (*destructor)(struct page *);
+	struct page *head;
+
+	if (unlikely(PageCompound(page))) {
+		head = (struct page *)page->private;
+		if (!put_page_testzero(head))
+			return;
+		destructor = (void (*)(struct page *))head[1].mapping;
+		(*destructor)(head);
+		return;
+	}
+	if (PageReserved(page))
+		return;
+	if (put_page_testzero(page))
+		__page_cache_release(page);
+}
+EXPORT_SYMBOL(put_page);
+#endif
+
+/*
+ * Called as writeback is about to complete against a page that was marked
+ * for immediate reclaim.  When the page still looks reclaimable it is moved
+ * to the tail of its zone's inactive list.  PageWriteback is still set on
+ * entry, which keeps the page pinned.
+ *
+ * Few pages are expected to come through here, so nothing is batched.
+ *
+ * PG_writeback is cleared here, inside the same zone->lru_lock critical
+ * section as the list motion, because once PG_writeback is cleared the page
+ * may no longer be touched.
+ *
+ * Returns zero if PG_writeback was cleared, one if the page was left alone.
+ */
+int rotate_reclaimable_page(struct page *page)
+{
+	unsigned long irq_flags;
+	struct zone *z;
+
+	/* Unlocked filter: locked, dirty, active or non-LRU pages are skipped. */
+	if (PageLocked(page) || PageDirty(page) ||
+	    PageActive(page) || !PageLRU(page))
+		return 1;
+
+	z = page_zone(page);
+	spin_lock_irqsave(&z->lru_lock, irq_flags);
+	/* Recheck under the lock before touching the list. */
+	if (PageLRU(page) && !PageActive(page)) {
+		list_del(&page->lru);
+		list_add_tail(&page->lru, &z->inactive_list);
+		inc_page_state(pgrotated);
+	}
+	if (!test_clear_page_writeback(page))
+		BUG();
+	spin_unlock_irqrestore(&z->lru_lock, irq_flags);
+	return 0;
+}
+
+/*
+ * Move a page from the inactive list to the active list of its zone.
+ *
+ * Under zone->lru_lock, a page that is on the LRU and not yet active is
+ * taken off the inactive list, marked PageActive, added to the active list
+ * and counted in the pgactivate statistic.  Any other page is left alone.
+ *
+ * FIXME: speed this up?
+ */
+void fastcall activate_page(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	if (PageLRU(page) && !PageActive(page)) {
+		del_page_from_inactive_list(zone, page);
+		SetPageActive(page);
+		add_page_to_active_list(zone, page);
+		inc_page_state(pgactivate);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * inactive,unreferenced	->	inactive,referenced
+ * inactive,referenced		->	active,unreferenced
+ * active,unreferenced		->	active,referenced
+ *
+ * Promotion to the active list is only attempted for pages that are on the
+ * LRU.  A page that is already referenced and either active or not on the
+ * LRU is left unchanged.
+ */
+void fastcall mark_page_accessed(struct page *page)
+{
+	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+		/* Second touch of an inactive page: promote, restart the count. */
+		activate_page(page);
+		ClearPageReferenced(page);
+	} else if (!PageReferenced(page)) {
+		/* First touch: just remember it. */
+		SetPageReferenced(page);
+	}
+}
+
+EXPORT_SYMBOL(mark_page_accessed);
+
+/*
+ * Per-CPU pagevecs that batch pages waiting to be added to the inactive
+ * (lru_add_pvecs) and active (lru_add_active_pvecs) LRU lists.
+ */
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ *
+ * Takes a reference on @page and queues it in this CPU's lru_add_pvecs.
+ * When pagevec_add() returns zero (presumably: the pagevec has no room
+ * left - confirm against pagevec_add()), the whole batch is passed to
+ * __pagevec_lru_add(), which puts the pages on the inactive list.
+ */
+void fastcall lru_cache_add(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
+/*
+ * Like lru_cache_add(), but the page is queued in this CPU's
+ * lru_add_active_pvecs and flushed through __pagevec_lru_add_active().
+ * A reference is taken on @page.
+ */
+void fastcall lru_cache_add_active(struct page *page)
+{
+	struct pagevec *batch;
+
+	batch = &get_cpu_var(lru_add_active_pvecs);
+	page_cache_get(page);
+	if (pagevec_add(batch, page) == 0)
+		__pagevec_lru_add_active(batch);
+	put_cpu_var(lru_add_active_pvecs);
+}
+
+/*
+ * Flush this CPU's deferred LRU-addition pagevecs: anything queued in
+ * lru_add_pvecs or lru_add_active_pvecs is pushed onto the LRU lists now.
+ */
+void lru_add_drain(void)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+	if (pagevec_count(pvec))
+		__pagevec_lru_add(pvec);
+	/* Fetched inside the get_cpu_var()/put_cpu_var() section opened above. */
+	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_active(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
+ *
+ * Under zone->lru_lock the page is taken off the LRU if PageLRU was set.
+ * The page count is then rechecked: if it is no longer zero the page is
+ * not freed here.  The actual free_hot_page() happens after the lock has
+ * been dropped.
+ */
+void fastcall __page_cache_release(struct page *page)
+{
+	unsigned long flags;
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	if (TestClearPageLRU(page))
+		del_page_from_lru(zone, page);
+	/* Someone holds a reference again: leave the page to them. */
+	if (page_count(page) != 0)
+		page = NULL;
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
+	if (page)
+		free_hot_page(page);
+}
+
+EXPORT_SYMBOL(__page_cache_release);
+
+/*
+ * Batched page_cache_release().  Decrement the reference count on all the
+ * passed pages.  If it fell to zero then remove the page from the LRU and
+ * free it.
+ *
+ * @pages: array of pages to release
+ * @nr:    number of entries in @pages
+ * @cold:  hint passed to pagevec_init() for the pagevec used to free pages
+ *
+ * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
+ * for the remainder of the operation.
+ *
+ * The locking in this function is against shrink_cache(): we recheck the
+ * page count inside the lock to see whether shrink_cache grabbed the page
+ * via the LRU.  If it did, give up: shrink_cache will free it.
+ */
+void release_pages(struct page **pages, int nr, int cold)
+{
+	int i;
+	struct pagevec pages_to_free;
+	struct zone *zone = NULL;	/* zone whose lru_lock is currently held */
+
+	pagevec_init(&pages_to_free, cold);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pages[i];
+		struct zone *pagezone;
+
+		/* Reserved pages and pages with remaining users need no lock. */
+		if (PageReserved(page) || !put_page_testzero(page))
+			continue;
+
+		/* Switch locks only when the page lives in a different zone. */
+		pagezone = page_zone(page);
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestClearPageLRU(page))
+			del_page_from_lru(zone, page);
+		if (page_count(page) == 0) {
+			/* Batch is full: free it with the lock dropped. */
+			if (!pagevec_add(&pages_to_free, page)) {
+				spin_unlock_irq(&zone->lru_lock);
+				__pagevec_free(&pages_to_free);
+				pagevec_reinit(&pages_to_free);
+				zone = NULL;	/* No lock is held */
+			}
+		}
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+
+	pagevec_free(&pages_to_free);
+}
+
+/*
+ * Release every page held in @pvec and reinitialise the pagevec.
+ *
+ * The pages being released may still sit in the deferred lru-addition
+ * queues, which would keep them from really being freed right now.  That is
+ * correct but inefficient - the pages may be cache-warm and should go back
+ * to the page allocator as soon as possible - so the queues are drained
+ * first.
+ *
+ * __pagevec_lru_add() and __pagevec_lru_add_active() call release_pages()
+ * directly rather than this function, to avoid mutual recursion.
+ */
+void __pagevec_release(struct pagevec *pvec)
+{
+	int count;
+
+	lru_add_drain();
+	count = pagevec_count(pvec);
+	release_pages(pvec->pages, count, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * pagevec_release() for pages which are known to not be on the LRU
+ *
+ * Drops one reference on every page in @pvec; pages whose count reaches
+ * zero are collected in a local pagevec and freed in one go.  Finding a
+ * page with PageLRU set is a BUG.
+ *
+ * This function reinitialises the caller's pagevec.
+ */
+void __pagevec_release_nonlru(struct pagevec *pvec)
+{
+	int i;
+	struct pagevec pages_to_free;
+
+	/* pagevec_init() records the cold hint, as in release_pages(). */
+	pagevec_init(&pages_to_free, pvec->cold);
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		BUG_ON(PageLRU(page));
+		/*
+		 * pages_to_free is a struct pagevec like @pvec, so it is
+		 * assumed to have room for every page of @pvec; the return
+		 * value of pagevec_add() is therefore not checked.
+		 */
+		if (put_page_testzero(page))
+			pagevec_add(&pages_to_free, page);
+	}
+	pagevec_free(&pages_to_free);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them.  Reinitialises the caller's pagevec.
+ *
+ * Each page is put on the inactive list of its zone under that zone's
+ * lru_lock; the lock is only switched when consecutive pages belong to
+ * different zones.  A page that already has PageLRU set is a BUG.
+ */
+void __pagevec_lru_add(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;	/* zone whose lru_lock is currently held */
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		add_page_to_inactive_list(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	/* release_pages() is called directly, not via __pagevec_release(). */
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+EXPORT_SYMBOL(__pagevec_lru_add);
+
+/*
+ * Add the passed pages to the active list of their zones, then drop the
+ * caller's refcount on them.  Reinitialises the caller's pagevec.
+ *
+ * Works like __pagevec_lru_add(), except that each page is also marked
+ * PageActive.  A page that already has PageLRU or PageActive set is a BUG.
+ */
+void __pagevec_lru_add_active(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;	/* zone whose lru_lock is currently held */
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		if (TestSetPageActive(page))
+			BUG();
+		add_page_to_active_list(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Try to drop buffers from the pages in a pagevec.
+ *
+ * Only pages with PagePrivate set whose page lock can be taken without
+ * waiting are processed; all others are skipped.
+ */
+void pagevec_strip(struct pagevec *pvec)
+{
+	int idx;
+
+	for (idx = 0; idx < pagevec_count(pvec); idx++) {
+		struct page *candidate = pvec->pages[idx];
+
+		if (!PagePrivate(candidate) || TestSetPageLocked(candidate))
+			continue;
+
+		try_to_release_page(candidate, 0);
+		unlock_page(candidate);
+	}
+}
+
+/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec:	Where the resulting pages are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ *
+ * Searches @mapping for up to @nr_pages pages, beginning at index @start,
+ * and stores them in @pvec.  A reference is taken against every page that
+ * is placed in @pvec.
+ *
+ * The pages come back in ascending index order.  Pages that are not present
+ * are simply skipped, so the returned indices may have holes.
+ *
+ * pagevec_lookup() returns the number of pages which were found.
+ */
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
+{
+	unsigned found;
+
+	found = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	pvec->nr = found;
+	return pagevec_count(pvec);
+}
+
+/*
+ * Tagged variant of pagevec_lookup(): fills @pvec from
+ * find_get_pages_tag() and returns the number of pages now in @pvec.
+ *
+ * @index is passed by pointer to find_get_pages_tag(); presumably it is
+ * advanced past the pages that were found - confirm against that function.
+ */
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
+
+#ifdef CONFIG_SMP
+/*
+ * We tolerate a little inaccuracy to avoid ping-ponging the counter between
+ * CPUs
+ */
+#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)
+
+/* Per-CPU delta not yet folded into the global vm_committed_space. */
+static DEFINE_PER_CPU(long, committed_space) = 0;
+
+/*
+ * Account @pages (which may be negative) of committed memory.
+ *
+ * The change is accumulated in this CPU's committed_space and only added
+ * to the global vm_committed_space once the local delta has grown beyond
+ * ACCT_THRESHOLD in either direction.  Preemption is disabled around the
+ * per-CPU access.
+ */
+void vm_acct_memory(long pages)
+{
+	long *local;
+
+	preempt_disable();
+	local = &__get_cpu_var(committed_space);
+	*local += pages;
+	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
+		atomic_add(*local, &vm_committed_space);
+		*local = 0;
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(vm_acct_memory);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Flush the deferred LRU-addition pagevecs that belong to @cpu onto the
+ * LRU lists.  Called from cpu_swap_callback() for a CPU that has died.
+ */
+static void lru_drain_cache(unsigned int cpu)
+{
+	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+
+	/* CPU is dead, so no locking needed. */
+	if (pagevec_count(pvec))
+		__pagevec_lru_add(pvec);
+	pvec = &per_cpu(lru_add_active_pvecs, cpu);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_active(pvec);
+}
+
+/*
+ * CPU hotplug notifier.  Once a CPU is dead, its cached committed space is
+ * dropped back into the central pool and its pending LRU additions are
+ * flushed.  Every other action is ignored.  Always returns NOTIFY_OK.
+ */
+static int cpu_swap_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	if (action == CPU_DEAD) {
+		long dead_cpu = (long)hcpu;
+		long *cached = &per_cpu(committed_space, dead_cpu);
+
+		atomic_add(*cached, &vm_committed_space);
+		*cached = 0;
+		lru_drain_cache(dead_cpu);
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+/*
+ * Add @amount (which may be negative) to a per-CPU counter.
+ *
+ * The change is applied to this CPU's slot of @fbc.  Once the local value
+ * reaches +/-FBC_BATCH it is folded into fbc->count under fbc->lock and the
+ * local slot restarts at zero.  The whole update runs between get_cpu()
+ * and put_cpu().
+ */
+void percpu_counter_mod(struct percpu_counter *fbc, long amount)
+{
+	long count;
+	long *pcount;
+	int cpu = get_cpu();
+
+	pcount = per_cpu_ptr(fbc->counters, cpu);
+	count = *pcount + amount;
+	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += count;
+		spin_unlock(&fbc->lock);
+		count = 0;
+	}
+	*pcount = count;
+	put_cpu();
+}
+EXPORT_SYMBOL(percpu_counter_mod);
+#endif
+
+/*
+ * Perform any setup for the swap system: pick page_cluster from the amount
+ * of physical memory and register the CPU hotplug callback.
+ */
+void __init swap_setup(void)
+{
+	unsigned long mem_mb = num_physpages >> (20 - PAGE_SHIFT);
+
+	/*
+	 * Use a smaller cluster for small-memory machines.  Right now other
+	 * parts of the system mean that we _really_ don't want to cluster
+	 * much more than this.
+	 */
+	page_cluster = (mem_mb < 16) ? 2 : 3;
+
+	hotcpu_notifier(cpu_swap_callback, 0);
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
new file mode 100644
index 000000000000..a063a902ed03
--- /dev/null
+++ b/mm/swap_state.c
@@ -0,0 +1,382 @@
+/*
+ *  linux/mm/swap_state.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *
+ *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_list, to make sync_page look nicer, and to allow
+ * future use of radix_tree tags in the swap cache.
+ */
+
+/* Address-space operations installed on swapper_space. */
+static struct address_space_operations swap_aops = {
+	.writepage	= swap_writepage,
+	.sync_page	= block_sync_page,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+};
+
+/* Backing-device description installed on swapper_space. */
+static struct backing_dev_info swap_backing_dev_info = {
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.unplug_io_fn	= swap_unplug_io_fn,
+};
+
+/*
+ * The address_space of the swap cache.  Its radix tree is keyed by the
+ * swap entry value (see __add_to_swap_cache()) and protected by tree_lock.
+ */
+struct address_space swapper_space = {
+	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+	.tree_lock	= RW_LOCK_UNLOCKED,
+	.a_ops		= &swap_aops,
+	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
+	.backing_dev_info = &swap_backing_dev_info,
+};
+EXPORT_SYMBOL(swapper_space);
+
+/* Bump one of the swap_cache_info statistics counters by a plain increment. */
+#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
+
+/* Swap cache statistics, printed by show_swap_cache_info(). */
+static struct {
+	unsigned long add_total;	/* pages added to the swap cache */
+	unsigned long del_total;	/* pages deleted from the swap cache */
+	unsigned long find_success;	/* lookup_swap_cache() hits */
+	unsigned long find_total;	/* lookup_swap_cache() calls */
+	unsigned long noent_race;	/* swap_duplicate() failed on add */
+	unsigned long exist_race;	/* add found the entry already cached */
+} swap_cache_info;
+
+/*
+ * Print the swap cache statistics, followed by the free and total swap
+ * space.  The page counts are shifted by (PAGE_SHIFT - 10) to give kB.
+ */
+void show_swap_cache_info(void)
+{
+	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
+		swap_cache_info.add_total, swap_cache_info.del_total,
+		swap_cache_info.find_success, swap_cache_info.find_total,
+		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+}
+
+/*
+ * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ *
+ * @page must be neither PageSwapCache nor PagePrivate.  On success the page
+ * has gained a reference, is marked locked and SwapCache, and carries the
+ * swap entry value in page->private.  Returns 0 on success, otherwise the
+ * error from radix_tree_preload() or radix_tree_insert().
+ */
+static int __add_to_swap_cache(struct page *page,
+		swp_entry_t entry, int gfp_mask)
+{
+	int error;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PagePrivate(page));
+	error = radix_tree_preload(gfp_mask);
+	if (!error) {
+		write_lock_irq(&swapper_space.tree_lock);
+		error = radix_tree_insert(&swapper_space.page_tree,
+						entry.val, page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			SetPageSwapCache(page);
+			page->private = entry.val;
+			total_swapcache_pages++;
+			pagecache_acct(1);
+		}
+		write_unlock_irq(&swapper_space.tree_lock);
+		radix_tree_preload_end();
+	}
+	return error;
+}
+
+/*
+ * Take a reference on the swap entry and insert @page into the swap cache
+ * under it.  Returns 0 on success, -ENOENT if swap_duplicate() fails for
+ * the entry, or the error from __add_to_swap_cache(), in which case the
+ * swap reference is dropped again.
+ */
+static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	int err;
+
+	if (!swap_duplicate(entry)) {
+		INC_CACHE_INFO(noent_race);
+		return -ENOENT;
+	}
+
+	/*
+	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
+	 */
+	err = __add_to_swap_cache(page, entry, GFP_KERNEL);
+	if (!err) {
+		INC_CACHE_INFO(add_total);
+		return 0;
+	}
+
+	swap_free(entry);
+	if (err == -EEXIST)
+		INC_CACHE_INFO(exist_race);
+	return err;
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ *
+ * The page must be locked and not under writeback.  The callers in this
+ * file hold swapper_space.tree_lock for writing around the call.  The page
+ * is removed from the radix tree, page->private is cleared and the
+ * SwapCache flag dropped.  The swap entry and the page reference held by
+ * the cache are not released here.
+ */
+void __delete_from_swap_cache(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageSwapCache(page));
+	BUG_ON(PageWriteback(page));
+
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	page->private = 0;
+	ClearPageSwapCache(page);
+	total_swapcache_pages--;
+	pagecache_acct(-1);
+	INC_CACHE_INFO(del_total);
+}
+
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache.  Caller needs to hold the page lock.
+ *
+ * Returns 1 on success.  Returns 0 if no swap entry could be allocated or
+ * if inserting the page into the swap cache failed for a reason other than
+ * -EEXIST; on -EEXIST a fresh swap entry is tried.
+ */
+int add_to_swap(struct page * page)
+{
+	swp_entry_t entry;
+	int saved_flags;
+	int err;
+
+	if (!PageLocked(page))
+		BUG();
+
+	for (;;) {
+		entry = get_swap_page();
+		if (!entry.val)
+			return 0;
+
+		/*
+		 * Radix-tree node allocations are GFP_ATOMIC allocations
+		 * performed under PF_MEMALLOC, and can completely exhaust
+		 * the page allocator.
+		 *
+		 * So PF_MEMALLOC is dropped around the insertion.  The slab
+		 * allocations then fail earlier and radix-tree nodes get
+		 * allocated from the mempool reserves instead.
+		 *
+		 * __GFP_HIGH is still in effect for the radix-tree node
+		 * allocations, so some of the emergency pools remain
+		 * available, just not all of them.
+		 */
+		saved_flags = current->flags;
+		current->flags &= ~PF_MEMALLOC;
+
+		/* Add it to the swap cache; it is marked dirty below. */
+		err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN);
+
+		if (saved_flags & PF_MEMALLOC)
+			current->flags |= PF_MEMALLOC;
+
+		if (!err) {
+			/* Success */
+			SetPageUptodate(page);
+			SetPageDirty(page);
+			INC_CACHE_INFO(add_total);
+			return 1;
+		}
+		if (err == -EEXIST) {
+			/* Raced with "speculative" read_swap_cache_async */
+			INC_CACHE_INFO(exist_race);
+			swap_free(entry);
+			continue;
+		}
+		/* -ENOMEM radix-tree allocation failure */
+		swap_free(entry);
+		return 0;
+	}
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache and locked.
+ * It will never put the page into the free list,
+ * the caller has a reference on the page.
+ *
+ * Removes the page from the swap cache under swapper_space.tree_lock, then
+ * releases the swap entry that was stored in page->private and drops one
+ * page reference.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+	swp_entry_t entry;
+
+	BUG_ON(!PageSwapCache(page));
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+	BUG_ON(PagePrivate(page));
+  
+	/* Remember the entry: __delete_from_swap_cache() clears ->private. */
+	entry.val = page->private;
+
+	write_lock_irq(&swapper_space.tree_lock);
+	__delete_from_swap_cache(page);
+	write_unlock_irq(&swapper_space.tree_lock);
+
+	swap_free(entry);
+	page_cache_release(page);
+}
+
+/*
+ * Strange swizzling function only for use by shmem_writepage.
+ *
+ * Inserts @page into the swap cache under @entry and, if that worked, takes
+ * it out of the page cache, drops the page cache's reference, takes an
+ * extra reference on the swap entry and marks the page dirty.  Returns 0 on
+ * success or the error from __add_to_swap_cache().
+ */
+int move_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
+
+	if (err) {
+		if (err == -EEXIST)
+			INC_CACHE_INFO(exist_race);
+		return err;
+	}
+
+	remove_from_page_cache(page);
+	page_cache_release(page);	/* pagecache ref */
+	if (!swap_duplicate(entry))
+		BUG();
+	SetPageDirty(page);
+	INC_CACHE_INFO(add_total);
+	return err;
+}
+
+/*
+ * Strange swizzling function for shmem_getpage (and shmem_unuse).
+ *
+ * Adds @page to @mapping at @index and, if that worked, removes it from the
+ * swap cache and re-dirties it.  Returns 0 on success or the error from
+ * add_to_page_cache().
+ */
+int move_from_swap_cache(struct page *page, unsigned long index,
+		struct address_space *mapping)
+{
+	int err;
+
+	err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
+	if (err)
+		return err;
+
+	delete_from_swap_cache(page);
+	/* shift page from clean_pages to dirty_pages list */
+	ClearPageDirty(page);
+	set_page_dirty(page);
+	return err;
+}
+
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
+ * It is OK to test PageSwapCache without the page lock here, because
+ * remove_exclusive_swap_page() tests it again with the page locked.
+ * Nothing is done if the page lock cannot be taken without waiting.
+ * 					- Marcelo
+ */
+static inline void free_swap_cache(struct page *page)
+{
+	if (!PageSwapCache(page))
+		return;
+	if (TestSetPageLocked(page))
+		return;
+
+	remove_exclusive_swap_page(page);
+	unlock_page(page);
+}
+
+/* 
+ * Perform a free_page(), also freeing any swap cache associated with
+ * this page if it is the last user of the page. Can not do a lock_page,
+ * as we are holding the page_table_lock spinlock.
+ *
+ * The swap cache is therefore only freed when the page lock can be taken
+ * without waiting (see free_swap_cache()); the page reference is dropped
+ * in either case.
+ */
+void free_page_and_swap_cache(struct page *page)
+{
+	free_swap_cache(page);
+	page_cache_release(page);
+}
+
+/*
+ * Passed an array of @nr pages, drop them all from swapcache and then
+ * release them.  They are removed from the LRU and freed if this is their
+ * last use.  The array is worked through in batches of at most 16 pages.
+ */
+void free_pages_and_swap_cache(struct page **pages, int nr)
+{
+	int batch_max = 16;
+	int done = 0;
+
+	lru_add_drain();
+	while (nr) {
+		struct page **batch = pages + done;
+		int batch_len = min(batch_max, nr);
+		int i;
+
+		for (i = 0; i < batch_len; i++)
+			free_swap_cache(batch[i]);
+		release_pages(batch, batch_len, 0);
+		done += batch_len;
+		nr -= batch_len;
+	}
+}
+
+/*
+ * Lookup a swap entry in the swap cache. A found page will be returned
+ * unlocked and with its refcount incremented - we rely on the kernel
+ * lock getting page table operations atomic even if we drop the page
+ * lock before returning.
+ *
+ * Every call counts towards find_total, every hit towards find_success.
+ * Returns NULL if the entry is not in the swap cache.
+ */
+struct page * lookup_swap_cache(swp_entry_t entry)
+{
+	struct page *hit = find_get_page(&swapper_space, entry.val);
+
+	if (hit != NULL)
+		INC_CACHE_INFO(find_success);
+	INC_CACHE_INFO(find_total);
+
+	return hit;
+}
+
+/* 
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ *
+ * @vma and @addr are only used for the page allocation.  Returns the page
+ * found in the swap cache, or the newly allocated page on which a read has
+ * been started, or NULL on failure.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	struct page *found_page, *new_page = NULL;
+	int err;
+
+	do {
+		/*
+		 * First check the swap cache.  Since this is normally
+		 * called after lookup_swap_cache() failed, re-calling
+		 * that would confuse statistics.
+		 */
+		found_page = find_get_page(&swapper_space, entry.val);
+		if (found_page)
+			break;
+
+		/*
+		 * Get a new page to read into from swap.
+		 * The page is kept across retries of the loop.
+		 */
+		if (!new_page) {
+			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+			if (!new_page)
+				break;		/* Out of memory */
+		}
+
+		/*
+		 * Associate the page with swap entry in the swap cache.
+		 * May fail (-ENOENT) if swap entry has been freed since
+		 * our caller observed it.  May fail (-EEXIST) if there
+		 * is already a page associated with this entry in the
+		 * swap cache: added by a racing read_swap_cache_async,
+		 * or by try_to_swap_out (or shmem_writepage) re-using
+		 * the just freed swap entry for an existing page.
+		 * May fail (-ENOMEM) if radix-tree node allocation failed.
+		 */
+		err = add_to_swap_cache(new_page, entry);
+		if (!err) {
+			/*
+			 * Initiate read into locked page and return.
+			 */
+			lru_cache_add_active(new_page);
+			swap_readpage(NULL, new_page);
+			return new_page;
+		}
+	} while (err != -ENOENT && err != -ENOMEM);
+
+	/* The freshly allocated page was not used after all: drop it. */
+	if (new_page)
+		page_cache_release(new_page);
+	return found_page;
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
new file mode 100644
index 000000000000..a60e0075d55b
--- /dev/null
+++ b/mm/swapfile.c
@@ -0,0 +1,1672 @@
+/*
+ *  linux/mm/swapfile.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/shm.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/backing-dev.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/swapops.h>
+
+DEFINE_SPINLOCK(swaplock);
+/* Upper bound on valid swap types, as checked by swap_info_get(). */
+unsigned int nr_swapfiles;
+long total_swap_pages;
+static int swap_overflow;
+
+EXPORT_SYMBOL(total_swap_pages);
+
+/* Message fragments for the diagnostics printed by swap_info_get(). */
+static const char Bad_file[] = "Bad swap file entry ";
+static const char Unused_file[] = "Unused swap file entry ";
+static const char Bad_offset[] = "Bad swap offset entry ";
+static const char Unused_offset[] = "Unused swap offset entry ";
+
+struct swap_list_t swap_list = {-1, -1};
+
+/* One slot per swap area, indexed by swap type. */
+struct swap_info_struct swap_info[MAX_SWAPFILES];
+
+static DECLARE_MUTEX(swapon_sem);
+
+/*
+ * We need this because the bdev->unplug_fn can sleep and we cannot
+ * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
+ * cannot be turned into a semaphore.
+ */
+static DECLARE_RWSEM(swap_unplug_sem);
+
+/* Number of slots scan_swap_map() allocates sequentially per cluster. */
+#define SWAPFILE_CLUSTER 256
+
+/*
+ * unplug_io_fn of the swap backing_dev_info.  For a page that is in the
+ * swap cache, the call is forwarded to the unplug_io_fn of the block
+ * device that backs the page's swap area.  Runs under swap_unplug_sem held
+ * for reading.  @unused_bdi is not used.
+ */
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
+{
+	swp_entry_t entry;
+
+	down_read(&swap_unplug_sem);
+	entry.val = page->private;
+	if (PageSwapCache(page)) {
+		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+		struct backing_dev_info *bdi;
+
+		/*
+		 * If the page is removed from swapcache from under us (with a
+		 * racy try_to_unuse/swapoff) we need an additional reference
+		 * count to avoid reading garbage from page->private above. If
+		 * the WARN_ON triggers during a swapoff it maybe the race
+		 * condition and it's harmless. However if it triggers without
+		 * swapoff it signals a problem.
+		 */
+		WARN_ON(page_count(page) <= 1);
+
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		bdi->unplug_io_fn(bdi, page);
+	}
+	up_read(&swap_unplug_sem);
+}
+
+/*
+ * Allocate one free slot in the swap map of @si.
+ *
+ * Returns the offset of the slot, which has been marked in use, or 0 if
+ * the swap area has no free slot.  get_swap_page() calls this with the
+ * swap device lock of @si held.
+ */
+static inline int scan_swap_map(struct swap_info_struct *si)
+{
+	unsigned long offset;
+	/* 
+	 * We try to cluster swap pages by allocating them
+	 * sequentially in swap.  Once we've allocated
+	 * SWAPFILE_CLUSTER pages this way, however, we resort to
+	 * first-free allocation, starting a new cluster.  This
+	 * prevents us from scattering swap pages all over the entire
+	 * swap partition, so that we reduce overall disk seek times
+	 * between swap pages.  -- sct */
+	if (si->cluster_nr) {
+		/* Continue the current cluster at the next free slot. */
+		while (si->cluster_next <= si->highest_bit) {
+			offset = si->cluster_next++;
+			if (si->swap_map[offset])
+				continue;
+			si->cluster_nr--;
+			goto got_page;
+		}
+	}
+	si->cluster_nr = SWAPFILE_CLUSTER;
+
+	/* try to find an empty (even not aligned) cluster. */
+	offset = si->lowest_bit;
+ check_next_cluster:
+	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
+	{
+		unsigned long nr;
+		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
+			if (si->swap_map[nr])
+			{
+				/* Slot in use: restart the search behind it. */
+				offset = nr+1;
+				goto check_next_cluster;
+			}
+		/* We found a completely empty cluster, so start
+		 * using it.
+		 */
+		goto got_page;
+	}
+	/* No luck, so now go fine-grained as usual. -Andrea */
+	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
+		if (si->swap_map[offset])
+			continue;
+		si->lowest_bit = offset+1;
+	/*
+	 * Common exit for all three searches.  The label sits inside the
+	 * loop body, but the body returns, so nothing is iterated further.
+	 */
+	got_page:
+		if (offset == si->lowest_bit)
+			si->lowest_bit++;
+		if (offset == si->highest_bit)
+			si->highest_bit--;
+		/* Free range became empty: reset it to the "nothing free" state. */
+		if (si->lowest_bit > si->highest_bit) {
+			si->lowest_bit = si->max;
+			si->highest_bit = 0;
+		}
+		si->swap_map[offset] = 1;
+		si->inuse_pages++;
+		nr_swap_pages--;
+		si->cluster_next = offset+1;
+		return offset;
+	}
+	/* Nothing free at all. */
+	si->lowest_bit = si->max;
+	si->highest_bit = 0;
+	return 0;
+}
+
+/*
+ * Allocate a swap entry.
+ *
+ * Starting at swap_list.next, the swap areas are tried in list order until
+ * scan_swap_map() finds a free slot in one of them.  Returns the new entry,
+ * or an entry whose val is 0 if no swap space is available.  The swap list
+ * lock is held for the whole search.
+ */
+swp_entry_t get_swap_page(void)
+{
+	struct swap_info_struct * p;
+	unsigned long offset;
+	swp_entry_t entry;
+	int type, wrapped = 0;
+
+	entry.val = 0;	/* Out of memory */
+	swap_list_lock();
+	type = swap_list.next;
+	if (type < 0)
+		goto out;
+	if (nr_swap_pages <= 0)
+		goto out;
+
+	while (1) {
+		p = &swap_info[type];
+		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+			swap_device_lock(p);
+			offset = scan_swap_map(p);
+			swap_device_unlock(p);
+			if (offset) {
+				entry = swp_entry(type,offset);
+				/*
+				 * Pick where the next allocation starts: the
+				 * following area if it has the same priority,
+				 * otherwise the head of the list.
+				 */
+				type = swap_info[type].next;
+				if (type < 0 ||
+					p->prio != swap_info[type].prio) {
+						swap_list.next = swap_list.head;
+				} else {
+					swap_list.next = type;
+				}
+				goto out;
+			}
+		}
+		type = p->next;
+		if (!wrapped) {
+			/*
+			 * End of the list or of the current priority level:
+			 * restart once from the head of the list.
+			 */
+			if (type < 0 || p->prio != swap_info[type].prio) {
+				type = swap_list.head;
+				wrapped = 1;
+			}
+		} else
+			if (type < 0)
+				goto out;	/* out of swap space */
+	}
+out:
+	swap_list_unlock();
+	return entry;
+}
+
+/*
+ * Validate a swap entry and return the swap_info_struct it belongs to.
+ *
+ * On success the swap list lock and the swap device lock of the returned
+ * structure are both held; the caller releases them with swap_info_put().
+ * Returns NULL, with no lock held, for a zero entry or for an entry whose
+ * type or offset is out of range or unused.  Except for a zero entry, a
+ * diagnostic is printed; it is always prefixed "swap_free:", whichever
+ * function made the call.
+ */
+static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+
+	if (!entry.val)
+		goto out;
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		goto bad_nofile;
+	p = & swap_info[type];
+	if (!(p->flags & SWP_USED))
+		goto bad_device;
+	offset = swp_offset(entry);
+	if (offset >= p->max)
+		goto bad_offset;
+	if (!p->swap_map[offset])
+		goto bad_free;
+	swap_list_lock();
+	/* Let the next allocation start at this area if its priority is higher. */
+	if (p->prio > swap_info[swap_list.next].prio)
+		swap_list.next = type;
+	swap_device_lock(p);
+	return p;
+
+bad_free:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
+	goto out;
+bad_offset:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
+	goto out;
+bad_device:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
+	goto out;
+bad_nofile:
+	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
+out:
+	return NULL;
+}	
+
+/*
+ * Counterpart of swap_info_get(): release the swap device lock of @p and
+ * the swap list lock, in that order.
+ */
+static void swap_info_put(struct swap_info_struct * p)
+{
+	swap_device_unlock(p);
+	swap_list_unlock();
+}
+
+/*
+ * Drop one reference on the swap slot at @offset of @p and return the
+ * remaining count.  A count of SWAP_MAP_MAX or above is left untouched and
+ * returned as is.  When the count reaches zero the slot becomes free: the
+ * free range of @p is widened to include it and the global and per-area
+ * usage counters are adjusted.
+ */
+static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+{
+	int users = p->swap_map[offset];
+
+	if (users >= SWAP_MAP_MAX)
+		return users;
+
+	users--;
+	p->swap_map[offset] = users;
+	if (users)
+		return users;
+
+	if (offset < p->lowest_bit)
+		p->lowest_bit = offset;
+	if (offset > p->highest_bit)
+		p->highest_bit = offset;
+	nr_swap_pages++;
+	p->inuse_pages--;
+	return users;
+}
+
+/*
+ * Drop one reference on a swap entry.  An entry that swap_info_get()
+ * rejects is ignored.
+ *
+ * Caller has made sure that the swapdevice corresponding to entry
+ * is still around or has not been recycled.
+ */
+void swap_free(swp_entry_t entry)
+{
+	struct swap_info_struct *si = swap_info_get(entry);
+
+	if (!si)
+		return;
+
+	swap_entry_free(si, swp_offset(entry));
+	swap_info_put(si);
+}
+
+/*
+ * Check if we're the only user of a swap page,
+ * when the page is locked.
+ *
+ * Returns 1 if the swap map count of the page's entry is 1 and the page
+ * count, sampled under swapper_space.tree_lock, is 2.  Returns 0 otherwise,
+ * including when swap_info_get() rejects the entry.
+ */
+static int exclusive_swap_page(struct page *page)
+{
+	struct swap_info_struct *si;
+	swp_entry_t entry;
+	int exclusive = 0;
+
+	entry.val = page->private;
+	si = swap_info_get(entry);
+	if (!si)
+		return 0;
+
+	/* Is the only swap cache user the cache itself? */
+	if (si->swap_map[swp_offset(entry)] == 1) {
+		/* Recheck the page count with the swapcache lock held.. */
+		write_lock_irq(&swapper_space.tree_lock);
+		if (page_count(page) == 2)
+			exclusive = 1;
+		write_unlock_irq(&swapper_space.tree_lock);
+	}
+	swap_info_put(si);
+	return exclusive;
+}
+
+/*
+ * We can use this swap cache entry directly
+ * if there are no other references to it.
+ *
+ * exclusive_swap_page() does the real work; the page count and page flags
+ * are inspected first so that its locks are only taken when needed.
+ * The page must be locked.  Returns 1 if the page can be used directly,
+ * 0 otherwise.
+ */
+int can_share_swap_page(struct page *page)
+{
+	int refs;
+
+	if (!PageLocked(page))
+		BUG();
+
+	refs = page_count(page);
+
+	/* Sole reference: usable unless the page is reserved. */
+	if (refs == 1)
+		return !PageReserved(page);
+
+	/* A third reference is only tolerated on a PagePrivate page. */
+	if (refs == 3 && !PagePrivate(page))
+		return 0;
+
+	if (refs == 2 || refs == 3) {
+		if (!PageSwapCache(page))
+			return 0;
+		return exclusive_swap_page(page);
+	}
+	return 0;
+}
+
+/*
+ * Work out if there are any other processes sharing this
+ * swap cache page. Free it if you can. Return success.
+ *
+ * The page must be locked and must not be PagePrivate.  Returns 1 if the
+ * page was removed from the swap cache, in which case it has been marked
+ * dirty and its swap entry and the swap cache's page reference have been
+ * released.  Returns 0 if nothing was done.
+ */
+int remove_exclusive_swap_page(struct page *page)
+{
+	int retval;
+	struct swap_info_struct * p;
+	swp_entry_t entry;
+
+	BUG_ON(PagePrivate(page));
+	BUG_ON(!PageLocked(page));
+
+	/* Cheap checks first, before any lock is taken. */
+	if (!PageSwapCache(page))
+		return 0;
+	if (PageWriteback(page))
+		return 0;
+	if (page_count(page) != 2) /* 2: us + cache */
+		return 0;
+
+	entry.val = page->private;
+	p = swap_info_get(entry);
+	if (!p)
+		return 0;
+
+	/* Is the only swap cache user the cache itself? */
+	retval = 0;
+	if (p->swap_map[swp_offset(entry)] == 1) {
+		/* Recheck the page count with the swapcache lock held.. */
+		write_lock_irq(&swapper_space.tree_lock);
+		if ((page_count(page) == 2) && !PageWriteback(page)) {
+			__delete_from_swap_cache(page);
+			SetPageDirty(page);
+			retval = 1;
+		}
+		write_unlock_irq(&swapper_space.tree_lock);
+	}
+	swap_info_put(p);
+
+	/* Done after swap_info_put(): swap_free() takes the same locks again. */
+	if (retval) {
+		swap_free(entry);
+		page_cache_release(page);
+	}
+
+	return retval;
+}
+
+/*
+ * Free the swap entry like above, but also try to
+ * free the page cache entry if it is the last user.
+ *
+ * The swap cache page is only looked up when swap_entry_free()
+ * returns 1, and only with a trylock: if the page cannot be locked
+ * immediately it is simply left in the swap cache.
+ */
+void free_swap_and_cache(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+	struct page *page = NULL;
+
+	p = swap_info_get(entry);
+	if (p) {
+		if (swap_entry_free(p, swp_offset(entry)) == 1)
+			page = find_trylock_page(&swapper_space, entry.val);
+		swap_info_put(p);
+	}
+	if (page) {
+		int one_user;
+
+		BUG_ON(PagePrivate(page));
+		/* Take our own reference before sampling the count. */
+		page_cache_get(page);
+		one_user = (page_count(page) == 2);
+		/* Only cache user (+us), or swap space full? Free it! */
+		if (!PageWriteback(page) && (one_user || vm_swap_full())) {
+			delete_from_swap_cache(page);
+			SetPageDirty(page);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+}
+
+/*
+ * Replace the swap pte at @pte with a mapping of @page and drop the
+ * swap reference that pte was holding.
+ *
+ * The new pte is always installed nowrite (the same as COW pages after
+ * one process has exited).  We don't know just how many PTEs will
+ * share this swap entry, so be cautious and let do_wp_page work out
+ * what to do if a write is requested later.
+ *
+ * vma->vm_mm->page_table_lock is held.
+ */
+static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+		unsigned long addr, swp_entry_t entry, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+
+	inc_mm_counter(mm, rss);
+	get_page(page);
+	set_pte_at(mm, addr, pte, new_pte);
+	page_add_anon_rmap(page, vma, addr);
+	swap_free(entry);
+
+	/*
+	 * Move the page to the active list so it is not
+	 * immediately swapped out again after swapon.
+	 */
+	activate_page(page);
+}
+
+/*
+ * Scan the ptes under @pmd covering [addr, end) for one that is equal
+ * to the swap pte encoding of @entry.  The first match is handed to
+ * unuse_pte() and the scan stops there.
+ *
+ * Returns 1 if a pte was replaced, 0 if none matched.
+ */
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pte_t *pte;
+	pte_t swp_pte = swp_entry_to_pte(entry);
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		/*
+		 * swapoff spends a _lot_ of time in this loop!
+		 * Test inline before going to call unuse_pte.
+		 */
+		if (unlikely(pte_same(*pte, swp_pte))) {
+			unuse_pte(vma, pte, addr, entry, page);
+			pte_unmap(pte);
+			return 1;
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	/* pte was advanced one past the last entry examined. */
+	pte_unmap(pte - 1);
+	return 0;
+}
+
+/*
+ * Walk the pmd entries under @pud that cover [addr, end) and hand each
+ * populated one to unuse_pte_range().  Returns 1 as soon as a pte for
+ * @entry has been replaced, 0 if the whole range was scanned in vain.
+ */
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	unsigned long next;
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_none_or_clear_bad(pmd) &&
+		    unuse_pte_range(vma, pmd, addr, next, entry, page))
+			return 1;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Walk the pud entries under @pgd that cover [addr, end) and hand each
+ * populated one to unuse_pmd_range().  Returns 1 as soon as a pte for
+ * @entry has been replaced, 0 if the whole range was scanned in vain.
+ */
+static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	unsigned long next;
+	pud_t *pud = pud_offset(pgd, addr);
+
+	do {
+		next = pud_addr_end(addr, end);
+		if (!pud_none_or_clear_bad(pud) &&
+		    unuse_pmd_range(vma, pud, addr, next, entry, page))
+			return 1;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Look through @vma for a pte holding @entry and replace it with a
+ * mapping of @page.
+ *
+ * When the page has a mapping, only the single address reported by
+ * page_address_in_vma() is examined (and nothing at all if that is
+ * -EFAULT); otherwise the whole vma is scanned.
+ *
+ * Returns 1 if a pte was replaced, 0 otherwise.
+ */
+static int unuse_vma(struct vm_area_struct *vma,
+				swp_entry_t entry, struct page *page)
+{
+	unsigned long addr, end, next;
+	pgd_t *pgd;
+
+	if (!page->mapping) {
+		addr = vma->vm_start;
+		end = vma->vm_end;
+	} else {
+		addr = page_address_in_vma(page, vma);
+		if (addr == -EFAULT)
+			return 0;
+		end = addr + PAGE_SIZE;
+	}
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none_or_clear_bad(pgd) &&
+		    unuse_pud_range(vma, pgd, addr, next, entry, page))
+			return 1;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Search the vmas of @mm that have an anon_vma for a pte holding
+ * @entry, stopping at the first vma in which unuse_vma() replaced one.
+ *
+ * Called with @page locked.  If mmap_sem cannot be taken immediately,
+ * the page lock is dropped while sleeping on it and retaken afterwards.
+ * The scan itself runs under mm->page_table_lock.
+ *
+ * Always returns 0.
+ */
+static int unuse_mm(struct mm_struct *mm,
+				swp_entry_t entry, struct page *page)
+{
+	struct vm_area_struct *vma;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		/*
+		 * Our reference to the page stops try_to_unmap_one from
+		 * unmapping its ptes, so swapoff can make progress.
+		 */
+		unlock_page(page);
+		down_read(&mm->mmap_sem);
+		lock_page(page);
+	}
+	spin_lock(&mm->page_table_lock);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma && unuse_vma(vma, entry, page))
+			break;
+	}
+	spin_unlock(&mm->page_table_lock);
+	up_read(&mm->mmap_sem);
+	/*
+	 * Currently unuse_mm cannot fail, but leave error handling
+	 * at call sites for now, since we change it from time to time.
+	 */
+	return 0;
+}
+
+/*
+ * Scan swap_map from current position to next entry still in use.
+ * Recycle to start on reaching the end, returning 0 when empty.
+ *
+ * @prev is the offset returned by the previous call (0 on the first
+ * call).  An entry counts as in use when its map value is neither 0
+ * nor SWAP_MAP_BAD.  Offset 0 is never returned as a hit, so a return
+ * value of 0 unambiguously means "nothing left".
+ */
+static int find_next_to_unuse(struct swap_info_struct *si, int prev)
+{
+	int max = si->max;
+	int i = prev;
+	int count;
+
+	/*
+	 * No need for swap_device_lock(si) here: we're just looking
+	 * for whether an entry is in use, not modifying it; false
+	 * hits are okay, and sys_swapoff() has already prevented new
+	 * allocations from this area (while holding swap_list_lock()).
+	 */
+	for (;;) {
+		if (++i >= max) {
+			/* prev == 0: the scan already started at the bottom. */
+			if (!prev) {
+				i = 0;
+				break;
+			}
+			/*
+			 * No entries in use at top of swap_map,
+			 * loop back to start and recheck there.
+			 * The second pass covers offsets 1..prev only.
+			 */
+			max = prev + 1;
+			prev = 0;
+			i = 1;
+		}
+		count = si->swap_map[i];
+		if (count && count != SWAP_MAP_BAD)
+			break;
+	}
+	return i;
+}
+
+/*
+ * We completely avoid races by reading each swap page in advance,
+ * and then search for the process using it.  All the necessary
+ * page table adjustments can then be made atomically.
+ *
+ * @type indexes swap_info[].  For every entry that
+ * find_next_to_unuse() reports, the page is brought into the swap
+ * cache, the mms on the mmlist (and shmem, when init_mm is reached)
+ * are searched for references, and the page is finally removed from
+ * the swap cache and marked dirty.
+ *
+ * Returns 0 when no in-use entry is left, -EINTR if a signal is
+ * pending, -ENOMEM if no page could be obtained for an entry that is
+ * still in use, or whatever non-zero value unuse_mm() returned.
+ */
+static int try_to_unuse(unsigned int type)
+{
+	struct swap_info_struct * si = &swap_info[type];
+	struct mm_struct *start_mm;
+	unsigned short *swap_map;
+	unsigned short swcount;
+	struct page *page;
+	swp_entry_t entry;
+	int i = 0;
+	int retval = 0;
+	int reset_overflow = 0;
+	int shmem;
+
+	/*
+	 * When searching mms for an entry, a good strategy is to
+	 * start at the first mm we freed the previous entry from
+	 * (though actually we don't notice whether we or coincidence
+	 * freed the entry).  Initialize this start_mm with a hold.
+	 *
+	 * A simpler strategy would be to start at the last mm we
+	 * freed the previous entry from; but that would take less
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent.  If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that.  Though it's only a serious concern when an overflowed
+	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+	 */
+	start_mm = &init_mm;
+	atomic_inc(&init_mm.mm_users);
+
+	/*
+	 * Keep on scanning until all entries have gone.  Usually,
+	 * one pass through swap_map is enough, but not necessarily:
+	 * there are races when an instance of an entry might be missed.
+	 */
+	while ((i = find_next_to_unuse(si, i)) != 0) {
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			break;
+		}
+
+		/*
+		 * Get a page for the entry, using the existing swap
+		 * cache page if there is one.  Otherwise, get a clean
+		 * page and read the swap into it.
+		 */
+		swap_map = &si->swap_map[i];
+		entry = swp_entry(type, i);
+		page = read_swap_cache_async(entry, NULL, 0);
+		if (!page) {
+			/*
+			 * Either swap_duplicate() failed because entry
+			 * has been freed independently, and will not be
+			 * reused since sys_swapoff() already disabled
+			 * allocation from here, or alloc_page() failed.
+			 */
+			if (!*swap_map)
+				continue;
+			retval = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * Don't hold on to start_mm if it looks like exiting.
+		 */
+		if (atomic_read(&start_mm->mm_users) == 1) {
+			mmput(start_mm);
+			start_mm = &init_mm;
+			atomic_inc(&init_mm.mm_users);
+		}
+
+		/*
+		 * Wait for and lock page.  When do_swap_page races with
+		 * try_to_unuse, do_swap_page can handle the fault much
+		 * faster than try_to_unuse can locate the entry.  This
+		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
+		 * defer to do_swap_page in such a case - in some tests,
+		 * do_swap_page and try_to_unuse repeatedly compete.
+		 */
+		wait_on_page_locked(page);
+		wait_on_page_writeback(page);
+		lock_page(page);
+		wait_on_page_writeback(page);
+
+		/*
+		 * Remove all references to entry.
+		 * Whenever we reach init_mm, there's no address space
+		 * to search, but use it as a reminder to search shmem.
+		 */
+		shmem = 0;
+		swcount = *swap_map;
+		if (swcount > 1) {
+			if (start_mm == &init_mm)
+				shmem = shmem_unuse(entry, page);
+			else
+				retval = unuse_mm(start_mm, entry, page);
+		}
+		/*
+		 * References remain after trying start_mm: walk the rest
+		 * of the mmlist, holding a reference on the mm being
+		 * visited (prev_mm) so the list position stays valid
+		 * while mmlist_lock is dropped.
+		 */
+		if (*swap_map > 1) {
+			int set_start_mm = (*swap_map >= swcount);
+			struct list_head *p = &start_mm->mmlist;
+			struct mm_struct *new_start_mm = start_mm;
+			struct mm_struct *prev_mm = start_mm;
+			struct mm_struct *mm;
+
+			atomic_inc(&new_start_mm->mm_users);
+			atomic_inc(&prev_mm->mm_users);
+			spin_lock(&mmlist_lock);
+			while (*swap_map > 1 && !retval &&
+					(p = p->next) != &start_mm->mmlist) {
+				mm = list_entry(p, struct mm_struct, mmlist);
+				/* mm_users was 0: skip this mm, undo our increment. */
+				if (atomic_inc_return(&mm->mm_users) == 1) {
+					atomic_dec(&mm->mm_users);
+					continue;
+				}
+				spin_unlock(&mmlist_lock);
+				mmput(prev_mm);
+				prev_mm = mm;
+
+				cond_resched();
+
+				swcount = *swap_map;
+				if (swcount <= 1)
+					;
+				else if (mm == &init_mm) {
+					set_start_mm = 1;
+					shmem = shmem_unuse(entry, page);
+				} else
+					retval = unuse_mm(mm, entry, page);
+				/* The count dropped at this mm: start here next time. */
+				if (set_start_mm && *swap_map < swcount) {
+					mmput(new_start_mm);
+					atomic_inc(&mm->mm_users);
+					new_start_mm = mm;
+					set_start_mm = 0;
+				}
+				spin_lock(&mmlist_lock);
+			}
+			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
+			mmput(start_mm);
+			start_mm = new_start_mm;
+		}
+		if (retval) {
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+
+		/*
+		 * How could swap count reach 0x7fff when the maximum
+		 * pid is 0x7fff, and there's no way to repeat a swap
+		 * page within an mm (except in shmem, where it's the
+		 * shared object which takes the reference count)?
+		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+		 *
+		 * If that's wrong, then we should worry more about
+		 * exit_mmap() and do_munmap() cases described above:
+		 * we might be resetting SWAP_MAP_MAX too early here.
+		 * We know "Undead"s can happen, they're okay, so don't
+		 * report them; but do report if we reset SWAP_MAP_MAX.
+		 */
+		if (*swap_map == SWAP_MAP_MAX) {
+			swap_device_lock(si);
+			*swap_map = 1;
+			swap_device_unlock(si);
+			reset_overflow = 1;
+		}
+
+		/*
+		 * If a reference remains (rare), we would like to leave
+		 * the page in the swap cache; but try_to_unmap could
+		 * then re-duplicate the entry once we drop page lock,
+		 * so we might loop indefinitely; also, that page could
+		 * not be swapped out to other storage meanwhile.  So:
+		 * delete from cache even if there's another reference,
+		 * after ensuring that the data has been saved to disk -
+		 * since if the reference remains (rarer), it will be
+		 * read from disk into another page.  Splitting into two
+		 * pages would be incorrect if swap supported "shared
+		 * private" pages, but they are handled by tmpfs files.
+		 *
+		 * Note shmem_unuse already deleted a swappage from
+		 * the swap cache, unless the move to filepage failed:
+		 * in which case it left swappage in cache, lowered its
+		 * swap count to pass quickly through the loops above,
+		 * and now we must reincrement count to try again later.
+		 */
+		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+			struct writeback_control wbc = {
+				.sync_mode = WB_SYNC_NONE,
+			};
+
+			swap_writepage(page, &wbc);
+			/* Relock: the lock is evidently not held after swap_writepage(). */
+			lock_page(page);
+			wait_on_page_writeback(page);
+		}
+		if (PageSwapCache(page)) {
+			if (shmem)
+				swap_duplicate(entry);
+			else
+				delete_from_swap_cache(page);
+		}
+
+		/*
+		 * So we could skip searching mms once swap count went
+		 * to 1, we did not mark any present ptes as dirty: must
+		 * mark page dirty so shrink_list will preserve it.
+		 */
+		SetPageDirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		/*
+		 * Make sure that we aren't completely killing
+		 * interactive performance.
+		 */
+		cond_resched();
+	}
+
+	mmput(start_mm);
+	if (reset_overflow) {
+		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
+		swap_overflow = 0;
+	}
+	return retval;
+}
+
+/*
+ * After a successful try_to_unuse, empty the mmlist - but only if no
+ * swap area has any page in use any more; otherwise do nothing.
+ *
+ * swap_list_lock must be held on entry and exit.  Note that
+ * mmlist_lock nests inside swap_list_lock, and that an mm must be
+ * added to the mmlist just after page_duplicate (adding it before
+ * would be racy).
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *pos, *tmp;
+	unsigned int type;
+
+	for (type = 0; type < nr_swapfiles; type++) {
+		if (swap_info[type].inuse_pages)
+			return;
+	}
+
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(pos, tmp, &init_mm.mmlist) {
+		list_del_init(pos);
+	}
+	spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
+ * corresponds to page offset `offset'.
+ *
+ * The search starts at sis->curr_swap_extent and follows list.prev,
+ * stepping over the list head, until an extent containing `offset' is
+ * found.  Each extent visited is stored back in curr_swap_extent, so
+ * the next lookup starts where this one ended.  Coming back round to
+ * the starting extent without a match is a BUG.
+ */
+sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+{
+	struct swap_extent *se = sis->curr_swap_extent;
+	struct swap_extent *start_se = se;
+
+	for ( ; ; ) {
+		struct list_head *lh;
+
+		if (se->start_page <= offset &&
+				offset < (se->start_page + se->nr_pages)) {
+			return se->start_block + (offset - se->start_page);
+		}
+		lh = se->list.prev;
+		/* The list head is not an extent: skip it. */
+		if (lh == &sis->extent_list)
+			lh = lh->prev;
+		se = list_entry(lh, struct swap_extent, list);
+		sis->curr_swap_extent = se;
+		BUG_ON(se == start_se);		/* It *must* be present */
+	}
+}
+
+/*
+ * Unlink and free every extent on this swapdev's extent list, then
+ * reset its extent count to zero.
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+	struct list_head *pos, *tmp;
+
+	list_for_each_safe(pos, tmp, &sis->extent_list) {
+		struct swap_extent *se;
+
+		se = list_entry(pos, struct swap_extent, list);
+		list_del(&se->list);
+		kfree(se);
+	}
+	sis->nr_extents = 0;
+}
+
+/*
+ * Add a block range (and the corresponding page range) into this swapdev's
+ * extent list.  The extent list is kept sorted in block order.
+ *
+ * This function rather assumes that it is called in ascending sector_t order.
+ * The only coalescing done is the simple case where the new range directly
+ * continues an existing extent in both page and block numbering; that extent
+ * is then grown instead of allocating a new one.
+ *
+ * Returns 0 on success, -ENOMEM if a new extent could not be allocated.
+ */
+static int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+		unsigned long nr_pages, sector_t start_block)
+{
+	struct swap_extent *se;
+	struct swap_extent *new_se;
+	struct list_head *lh;
+
+	lh = sis->extent_list.next;	/* The highest-addressed block */
+	while (lh != &sis->extent_list) {
+		se = list_entry(lh, struct swap_extent, list);
+		if (se->start_block + se->nr_pages == start_block &&
+		    se->start_page  + se->nr_pages == start_page) {
+			/* Merge it */
+			se->nr_pages += nr_pages;
+			return 0;
+		}
+		lh = lh->next;
+	}
+
+	/*
+	 * No merge.  Insert a new extent, preserving ordering.
+	 */
+	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
+	if (new_se == NULL)
+		return -ENOMEM;
+	new_se->start_page = start_page;
+	new_se->nr_pages = nr_pages;
+	new_se->start_block = start_block;
+
+	/*
+	 * Walk backwards from the tail until an extent with a higher
+	 * start_block (or the list head) is reached, and link the new
+	 * extent in just before that position.
+	 */
+	lh = sis->extent_list.prev;	/* The lowest block */
+	while (lh != &sis->extent_list) {
+		se = list_entry(lh, struct swap_extent, list);
+		if (se->start_block > start_block)
+			break;
+		lh = lh->prev;
+	}
+	list_add_tail(&new_se->list, lh);
+	sis->nr_extents++;
+	return 0;
+}
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks.  An ordered list of swap extents
+ * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
+ * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * For S_ISREG swapfiles we hold i_sem across the life of the swapon.  This
+ * prevents root from shooting her foot off by ftruncating an in-use swapfile,
+ * which will scribble on the fs.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
+ * extents in the list.  To avoid much list walking, we cache the previous
+ * search location in `curr_swap_extent', and start new searches from there.
+ * This is extremely effective.  The average number of iterations in
+ * map_swap_page() has been measured at about 0.3 per page.  - akpm.
+ */
+/*
+ * Build sis->extent_list for the swap file in sis->swap_file.
+ *
+ * Returns 0 on success, -EINVAL if bmap() reports a hole or no usable
+ * page was found, or the error from add_swap_extent().  For a regular
+ * file, sis->max and sis->highest_bit are rewritten to match the
+ * number of pages actually mapped.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis)
+{
+	struct inode *inode;
+	unsigned blocks_per_page;
+	unsigned long page_no;
+	unsigned blkbits;
+	sector_t probe_block;
+	sector_t last_block;
+	int ret;
+
+	inode = sis->swap_file->f_mapping->host;
+	/* Block device: one extent mapping pages 0..max-1 to blocks 0..max-1. */
+	if (S_ISBLK(inode->i_mode)) {
+		ret = add_swap_extent(sis, 0, sis->max, 0);
+		goto done;
+	}
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	/*
+	 * Map all the blocks into the extent list.  This code doesn't try
+	 * to be very smart.
+	 */
+	probe_block = 0;
+	page_no = 0;
+	last_block = i_size_read(inode) >> blkbits;
+	while ((probe_block + blocks_per_page) <= last_block &&
+			page_no < sis->max) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/*
+		 * It must be PAGE_SIZE aligned on-disk
+		 */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto reprobe;
+		}
+
+		/* The remaining blocks of the page must follow first_block. */
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto reprobe;
+			}
+		}
+
+		/*
+		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+		 */
+		ret = add_swap_extent(sis, page_no, 1,
+				first_block >> (PAGE_SHIFT - blkbits));
+		if (ret)
+			goto out;
+		page_no++;
+		probe_block += blocks_per_page;
+reprobe:
+		continue;
+	}
+	ret = 0;
+	if (page_no == 0)
+		ret = -EINVAL;
+	/*
+	 * NOTE(review): when page_no is 0 the highest_bit assignment
+	 * below wraps around; the -EINVAL return is what callers must
+	 * act on in that case.
+	 */
+	sis->max = page_no;
+	sis->highest_bit = page_no - 1;
+done:
+	/*
+	 * Prime the lookup cache with the list tail.
+	 * NOTE(review): this also runs when ret is non-zero and the list
+	 * may be empty, so curr_swap_extent is only meaningful on success.
+	 */
+	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
+					struct swap_extent, list);
+	goto out;
+bad_bmap:
+	printk(KERN_ERR "swapon: swapfile has holes\n");
+	ret = -EINVAL;
+out:
+	return ret;
+}
+
+#if 0	/* We don't need this yet */
+#include <linux/backing-dev.h>
+/*
+ * Compiled out.  Would report whether the backing device that a locked
+ * page writes to is congested: the swap device's bdev mapping for a
+ * swap cache page, otherwise the page's own mapping.
+ */
+int page_queue_congested(struct page *page)
+{
+	struct backing_dev_info *bdi;
+
+	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
+
+	if (PageSwapCache(page)) {
+		swp_entry_t entry = { .val = page->private };
+		struct swap_info_struct *sis;
+
+		sis = get_swap_info_struct(swp_type(entry));
+		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
+	} else
+		bdi = page->mapping->backing_dev_info;
+	return bdi_write_congested(bdi);
+}
+#endif
+
+/*
+ * The swapoff system call: stop swapping to the area backed by
+ * @specialfile and release it.
+ *
+ * The area is first unlinked from swap_list and made non-writable so
+ * that nothing new is allocated from it, then try_to_unuse() brings
+ * back everything it holds.  If that fails the area is put back on
+ * the list exactly as it was accounted before.
+ *
+ * Returns 0 on success; -EPERM without CAP_SYS_ADMIN; the error from
+ * getname() or filp_open(); -EINVAL if no active swap area uses the
+ * file's mapping; -ENOMEM if security_vm_enough_memory() refuses; or
+ * the error from try_to_unuse().
+ */
+asmlinkage long sys_swapoff(const char __user * specialfile)
+{
+	struct swap_info_struct * p = NULL;
+	unsigned short *swap_map;
+	struct file *swap_file, *victim;
+	struct address_space *mapping;
+	struct inode *inode;
+	char * pathname;
+	int i, type, prev;
+	int err;
+	
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	pathname = getname(specialfile);
+	err = PTR_ERR(pathname);
+	if (IS_ERR(pathname))
+		goto out;
+
+	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
+	putname(pathname);
+	err = PTR_ERR(victim);
+	if (IS_ERR(victim))
+		goto out;
+
+	/*
+	 * Find the active swap area that shares the victim's mapping,
+	 * remembering its predecessor on the list for the unlink below.
+	 */
+	mapping = victim->f_mapping;
+	prev = -1;
+	swap_list_lock();
+	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
+		p = swap_info + type;
+		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+			if (p->swap_file->f_mapping == mapping)
+				break;
+		}
+		prev = type;
+	}
+	if (type < 0) {
+		err = -EINVAL;
+		swap_list_unlock();
+		goto out_dput;
+	}
+	/*
+	 * The check charges p->pages on success; that charge is undone
+	 * straight away, so only the yes/no answer is used.
+	 */
+	if (!security_vm_enough_memory(p->pages))
+		vm_unacct_memory(p->pages);
+	else {
+		err = -ENOMEM;
+		swap_list_unlock();
+		goto out_dput;
+	}
+	if (prev < 0) {
+		swap_list.head = p->next;
+	} else {
+		swap_info[prev].next = p->next;
+	}
+	if (type == swap_list.next) {
+		/* just pick something that's safe... */
+		swap_list.next = swap_list.head;
+	}
+	nr_swap_pages -= p->pages;
+	total_swap_pages -= p->pages;
+	p->flags &= ~SWP_WRITEOK;
+	swap_list_unlock();
+	current->flags |= PF_SWAPOFF;
+	err = try_to_unuse(type);
+	current->flags &= ~PF_SWAPOFF;
+
+	/* wait for any unplug function to finish */
+	down_write(&swap_unplug_sem);
+	up_write(&swap_unplug_sem);
+
+	if (err) {
+		/* re-insert swap space back into swap_list */
+		swap_list_lock();
+		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
+			if (p->prio >= swap_info[i].prio)
+				break;
+		p->next = i;
+		if (prev < 0)
+			swap_list.head = swap_list.next = p - swap_info;
+		else
+			swap_info[prev].next = p - swap_info;
+		nr_swap_pages += p->pages;
+		total_swap_pages += p->pages;
+		p->flags |= SWP_WRITEOK;
+		swap_list_unlock();
+		goto out_dput;
+	}
+	/*
+	 * Success: detach everything from the swap_info slot under the
+	 * locks, and free it only after they have been dropped.
+	 */
+	down(&swapon_sem);
+	swap_list_lock();
+	drain_mmlist();
+	swap_device_lock(p);
+	swap_file = p->swap_file;
+	p->swap_file = NULL;
+	p->max = 0;
+	swap_map = p->swap_map;
+	p->swap_map = NULL;
+	p->flags = 0;
+	destroy_swap_extents(p);
+	swap_device_unlock(p);
+	swap_list_unlock();
+	up(&swapon_sem);
+	vfree(swap_map);
+	inode = mapping->host;
+	if (S_ISBLK(inode->i_mode)) {
+		struct block_device *bdev = I_BDEV(inode);
+		set_blocksize(bdev, p->old_block_size);
+		bd_release(bdev);
+	} else {
+		down(&inode->i_sem);
+		inode->i_flags &= ~S_SWAPFILE;
+		up(&inode->i_sem);
+	}
+	filp_close(swap_file, NULL);
+	err = 0;
+
+out_dput:
+	filp_close(victim, NULL);
+out:
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+/* iterator */
+/*
+ * seq_file start: take swapon_sem (released again in swap_stop()) and
+ * return the (*pos)'th swap_info slot that is SWP_USED and has a
+ * swap_map, or NULL when there are fewer such slots.
+ */
+static void *swap_start(struct seq_file *swap, loff_t *pos)
+{
+	loff_t remaining = *pos;
+	int type;
+
+	down(&swapon_sem);
+
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *si = &swap_info[type];
+
+		if ((si->flags & SWP_USED) && si->swap_map) {
+			if (!remaining--)
+				return si;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file next: advance from slot @v to the following swap_info slot
+ * that is SWP_USED and has a swap_map, bumping *pos when one is found.
+ * Returns NULL at the end of the table.
+ */
+static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
+{
+	struct swap_info_struct *si = v;
+	struct swap_info_struct *end = swap_info + nr_swapfiles;
+
+	while (++si < end) {
+		if ((si->flags & SWP_USED) && si->swap_map) {
+			++*pos;
+			return si;
+		}
+	}
+
+	return NULL;
+}
+
+/* seq_file stop: release the swapon_sem taken in swap_start(). */
+static void swap_stop(struct seq_file *swap, void *v)
+{
+	up(&swapon_sem);
+}
+
+/*
+ * seq_file show: print one line for the swap area @v - path, type
+ * ("partition" for a block device, "file" otherwise), size and used
+ * space (page counts shifted by PAGE_SHIFT - 10, i.e. in kilobytes)
+ * and priority.  Always returns 0.
+ *
+ * NOTE(review): the column header is only printed when @v is
+ * swap_info[0]; swap_start() skips unused slots, so if slot 0 is
+ * unused no header appears at all.
+ */
+static int swap_show(struct seq_file *swap, void *v)
+{
+	struct swap_info_struct *ptr = v;
+	struct file *file;
+	int len;
+
+	if (v == swap_info)
+		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+
+	file = ptr->swap_file;
+	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+	/* Pad the path out to 40 columns, or by one space if it is longer. */
+	seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
+		       len < 40 ? 40 - len : 1, " ",
+		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+				"partition" : "file\t",
+		       ptr->pages << (PAGE_SHIFT - 10),
+		       ptr->inuse_pages << (PAGE_SHIFT - 10),
+		       ptr->prio);
+	return 0;
+}
+
+/* Iterator callbacks handed to seq_open() by swaps_open(). */
+static struct seq_operations swaps_op = {
+	.start =	swap_start,
+	.next =		swap_next,
+	.stop =		swap_stop,
+	.show =		swap_show
+};
+
+/* open() handler: attach the swaps_op iterator to @file via seq_open(). */
+static int swaps_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &swaps_op);
+}
+
+/* File operations installed on the "swaps" proc entry by procswaps_init(). */
+static struct file_operations proc_swaps_operations = {
+	.open		= swaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Create the "swaps" proc entry and hook up its file operations.
+ * A failure to create the entry is ignored: 0 is returned either way.
+ */
+static int __init procswaps_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("swaps", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_swaps_operations;
+	return 0;
+}
+__initcall(procswaps_init);
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
+ *
+ * The swapon system call
+ *
+ * Claims a free swap_info slot, opens @specialfile (block device or
+ * regular file), validates the swap header found in its first page,
+ * builds the swap map and extent list, and finally links the area
+ * into swap_list according to its priority.  @swap_flags may carry
+ * SWAP_FLAG_PREFER together with an explicit priority; otherwise each
+ * new area gets a priority one lower than the previous one.
+ *
+ * Returns 0 on success, or a negative errno: -EPERM (no
+ * CAP_SYS_ADMIN, or no swap type left), -EBUSY (already in use as
+ * swap), -EINVAL (bad file type or header), -EIO (header page could
+ * not be read), -ENOMEM, or an error passed up from getname(),
+ * filp_open(), bd_claim(), set_blocksize(), read_cache_page() or
+ * setup_swap_extents().
+ */
+asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+{
+	struct swap_info_struct * p;
+	char *name = NULL;
+	struct block_device *bdev = NULL;
+	struct file *swap_file = NULL;
+	struct address_space *mapping;
+	unsigned int type;
+	int i, prev;
+	int error;
+	static int least_priority;
+	union swap_header *swap_header = NULL;
+	int swap_header_version;
+	int nr_good_pages = 0;
+	unsigned long maxpages = 1;
+	int swapfilesize;
+	unsigned short *swap_map;
+	struct page *page = NULL;
+	struct inode *inode = NULL;
+	int did_down = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	swap_list_lock();
+	p = swap_info;
+	for (type = 0 ; type < nr_swapfiles ; type++,p++)
+		if (!(p->flags & SWP_USED))
+			break;
+	error = -EPERM;
+	/*
+	 * Test if adding another swap device is possible. There are
+	 * two limiting factors: 1) the number of bits for the swap
+	 * type swp_entry_t definition and 2) the number of bits for
+	 * the swap type in the swap ptes as defined by the different
+	 * architectures. To honor both limitations a swap entry
+	 * with swap offset 0 and swap type ~0UL is created, encoded
+	 * to a swap pte, decoded to a swp_entry_t again and finally
+	 * the swap type part is extracted. This will mask all bits
+	 * from the initial ~0UL that can't be encoded in either the
+	 * swp_entry_t or the architecture definition of a swap pte.
+	 */
+	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+		swap_list_unlock();
+		goto out;
+	}
+	if (type >= nr_swapfiles)
+		nr_swapfiles = type+1;
+	INIT_LIST_HEAD(&p->extent_list);
+	p->flags = SWP_USED;
+	p->nr_extents = 0;
+	p->swap_file = NULL;
+	p->old_block_size = 0;
+	p->swap_map = NULL;
+	p->lowest_bit = 0;
+	p->highest_bit = 0;
+	p->cluster_nr = 0;
+	p->inuse_pages = 0;
+	spin_lock_init(&p->sdev_lock);
+	p->next = -1;
+	if (swap_flags & SWAP_FLAG_PREFER) {
+		p->prio =
+		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
+	} else {
+		p->prio = --least_priority;
+	}
+	swap_list_unlock();
+	name = getname(specialfile);
+	error = PTR_ERR(name);
+	if (IS_ERR(name)) {
+		name = NULL;
+		goto bad_swap_2;
+	}
+	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+	error = PTR_ERR(swap_file);
+	if (IS_ERR(swap_file)) {
+		swap_file = NULL;
+		goto bad_swap_2;
+	}
+
+	p->swap_file = swap_file;
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	/* Refuse a file whose mapping already backs another swap area. */
+	error = -EBUSY;
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct swap_info_struct *q = &swap_info[i];
+
+		if (i == type || !q->swap_file)
+			continue;
+		if (mapping == q->swap_file->f_mapping)
+			goto bad_swap;
+	}
+
+	error = -EINVAL;
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		error = bd_claim(bdev, sys_swapon);
+		if (error < 0) {
+			bdev = NULL;
+			goto bad_swap;
+		}
+		p->old_block_size = block_size(bdev);
+		error = set_blocksize(bdev, PAGE_SIZE);
+		if (error < 0)
+			goto bad_swap;
+		p->bdev = bdev;
+	} else if (S_ISREG(inode->i_mode)) {
+		p->bdev = inode->i_sb->s_bdev;
+		down(&inode->i_sem);
+		did_down = 1;
+		if (IS_SWAPFILE(inode)) {
+			error = -EBUSY;
+			goto bad_swap;
+		}
+	} else {
+		goto bad_swap;
+	}
+
+	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+
+	/*
+	 * Read the swap header.
+	 */
+	if (!mapping->a_ops->readpage) {
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	page = read_cache_page(mapping, 0,
+			(filler_t *)mapping->a_ops->readpage, swap_file);
+	if (IS_ERR(page)) {
+		error = PTR_ERR(page);
+		goto bad_swap;
+	}
+	wait_on_page_locked(page);
+	if (!PageUptodate(page)) {
+		/*
+		 * Set the error explicitly: on the block device path
+		 * "error" still holds the 0 returned by set_blocksize(),
+		 * which would otherwise report a failed swapon as success.
+		 */
+		error = -EIO;
+		goto bad_swap;
+	}
+	kmap(page);
+	/* A non-NULL swap_header also records that the page is kmapped. */
+	swap_header = page_address(page);
+
+	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
+		swap_header_version = 1;
+	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
+		swap_header_version = 2;
+	else {
+		printk("Unable to find swap-space signature\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+
+	switch (swap_header_version) {
+	case 1:
+		printk(KERN_ERR "version 0 swap is no longer supported. "
+			"Use mkswap -v1 %s\n", name);
+		error = -EINVAL;
+		goto bad_swap;
+	case 2:
+		/* Check the swap header's sub-version and the size of
+                   the swap file and bad block lists */
+		if (swap_header->info.version != 1) {
+			printk(KERN_WARNING
+			       "Unable to handle swap header version %d\n",
+			       swap_header->info.version);
+			error = -EINVAL;
+			goto bad_swap;
+		}
+
+		p->lowest_bit  = 1;
+		/*
+		 * Find out how many pages are allowed for a single swap
+		 * device. There are two limiting factors: 1) the number of
+		 * bits for the swap offset in the swp_entry_t type and
+		 * 2) the number of bits in the a swap pte as defined by
+		 * the different architectures. In order to find the
+		 * largest possible bit mask a swap entry with swap type 0
+		 * and swap offset ~0UL is created, encoded to a swap pte,
+		 * decoded to a swp_entry_t again and finally the swap
+		 * offset is extracted. This will mask all the bits from
+		 * the initial ~0UL mask that can't be encoded in either
+		 * the swp_entry_t or the architecture definition of a
+		 * swap pte.
+		 */
+		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
+		if (maxpages > swap_header->info.last_page)
+			maxpages = swap_header->info.last_page;
+		p->highest_bit = maxpages - 1;
+
+		error = -EINVAL;
+		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+			goto bad_swap;
+
+		/* OK, set up the swap map and apply the bad block list */
+		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+
+		error = 0;
+		memset(p->swap_map, 0, maxpages * sizeof(short));
+		for (i=0; i<swap_header->info.nr_badpages; i++) {
+			int bad_page = swap_header->info.badpages[i];
+
+			if (bad_page <= 0 ||
+			    bad_page >= swap_header->info.last_page)
+				error = -EINVAL;
+			/*
+			 * swap_map only has maxpages slots, and maxpages
+			 * can be smaller than last_page: a bad page
+			 * beyond it lies outside the area we will use,
+			 * so it must not be written into the map.
+			 */
+			else if ((unsigned long)bad_page < maxpages)
+				p->swap_map[bad_page] = SWAP_MAP_BAD;
+		}
+		nr_good_pages = swap_header->info.last_page -
+				swap_header->info.nr_badpages -
+				1 /* header page */;
+		if (error)
+			goto bad_swap;
+	}
+
+	if (swapfilesize && maxpages > swapfilesize) {
+		printk(KERN_WARNING
+		       "Swap area shorter than signature indicates\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	if (!nr_good_pages) {
+		printk(KERN_WARNING "Empty swap-file\n");
+		error = -EINVAL;
+		goto bad_swap;
+	}
+	/* Page 0 holds the header and is never handed out. */
+	p->swap_map[0] = SWAP_MAP_BAD;
+	p->max = maxpages;
+	p->pages = nr_good_pages;
+
+	error = setup_swap_extents(p);
+	if (error)
+		goto bad_swap;
+
+	down(&swapon_sem);
+	swap_list_lock();
+	swap_device_lock(p);
+	p->flags = SWP_ACTIVE;
+	nr_swap_pages += nr_good_pages;
+	total_swap_pages += nr_good_pages;
+	printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
+		nr_good_pages<<(PAGE_SHIFT-10), name,
+		p->prio, p->nr_extents);
+
+	/* insert swap space into swap_list: */
+	prev = -1;
+	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+		if (p->prio >= swap_info[i].prio) {
+			break;
+		}
+		prev = i;
+	}
+	p->next = i;
+	if (prev < 0) {
+		swap_list.head = swap_list.next = p - swap_info;
+	} else {
+		swap_info[prev].next = p - swap_info;
+	}
+	swap_device_unlock(p);
+	swap_list_unlock();
+	up(&swapon_sem);
+	error = 0;
+	goto out;
+bad_swap:
+	if (bdev) {
+		set_blocksize(bdev, p->old_block_size);
+		bd_release(bdev);
+	}
+bad_swap_2:
+	swap_list_lock();
+	swap_map = p->swap_map;
+	p->swap_file = NULL;
+	p->swap_map = NULL;
+	p->flags = 0;
+	if (!(swap_flags & SWAP_FLAG_PREFER))
+		++least_priority;
+	swap_list_unlock();
+	destroy_swap_extents(p);
+	vfree(swap_map);
+	if (swap_file) {
+		/*
+		 * Release i_sem before closing the file: the close may
+		 * drop the last reference that keeps the inode alive.
+		 * Every path that gets here with i_sem held has a
+		 * non-zero error, so S_SWAPFILE would not be set anyway.
+		 */
+		if (did_down) {
+			up(&inode->i_sem);
+			did_down = 0;
+		}
+		filp_close(swap_file, NULL);
+	}
+out:
+	if (page && !IS_ERR(page)) {
+		/* Only undo the kmap() if it was actually reached. */
+		if (swap_header)
+			kunmap(page);
+		page_cache_release(page);
+	}
+	if (name)
+		putname(name);
+	if (did_down) {
+		if (!error)
+			inode->i_flags |= S_SWAPFILE;
+		up(&inode->i_sem);
+	}
+	return error;
+}
+
+/*
+ * Fill in the freeswap and totalswap fields of @val.
+ *
+ * Areas that are still SWP_USED but no longer SWP_WRITEOK have had
+ * their pages taken out of nr_swap_pages and total_swap_pages by
+ * sys_swapoff(); their in-use pages are added back to both figures
+ * here.
+ */
+void si_swapinfo(struct sysinfo *val)
+{
+	unsigned long nr_to_be_unused = 0;
+	unsigned int type;
+
+	swap_list_lock();
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *si = &swap_info[type];
+
+		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+			nr_to_be_unused += si->inuse_pages;
+	}
+	val->freeswap = nr_swap_pages + nr_to_be_unused;
+	val->totalswap = total_swap_pages + nr_to_be_unused;
+	swap_list_unlock();
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Returns 1 if the entry was in use and now carries one more
+ * reference, 0 if the type or offset is out of range or the entry is
+ * free.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
+ */
+int swap_duplicate(swp_entry_t entry)
+{
+	unsigned long type = swp_type(entry);
+	unsigned long offset;
+	struct swap_info_struct *si;
+	int result = 0;
+
+	if (type >= nr_swapfiles) {
+		printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+		return 0;
+	}
+	si = swap_info + type;
+	offset = swp_offset(entry);
+
+	swap_device_lock(si);
+	if (offset < si->max && si->swap_map[offset]) {
+		unsigned short *count = &si->swap_map[offset];
+
+		if (*count < SWAP_MAP_MAX - 1) {
+			(*count)++;
+			result = 1;
+		} else if (*count <= SWAP_MAP_MAX) {
+			/* Saturate instead of overflowing; warn a few times. */
+			if (swap_overflow++ < 5)
+				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
+			*count = SWAP_MAP_MAX;
+			result = 1;
+		}
+	}
+	swap_device_unlock(si);
+	return result;
+}
+
+/* Return the swap_info slot for @type; the index is not range-checked. */
+struct swap_info_struct *
+get_swap_info_struct(unsigned type)
+{
+	return swap_info + type;
+}
+
+/*
+ * swap_device_lock prevents swap_map being freed. Don't grab an extra
+ * reference on the swaphandle, it doesn't matter if it becomes unused.
+ *
+ * Counts how many consecutive swap entries, starting at the beginning
+ * of the (1 << page_cluster)-aligned group that contains @entry, are
+ * in use and not bad.  *offset is set to the first offset examined.
+ * Returns that count, or 0 without touching *offset when page_cluster
+ * is 0.
+ */
+int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+{
+	int ret = 0, i = 1 << page_cluster;
+	unsigned long toff;
+	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+
+	if (!page_cluster)	/* no readahead */
+		return 0;
+	/* Round the offset down to the start of its cluster. */
+	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
+	if (!toff)		/* first page is swap header */
+		toff++, i--;
+	*offset = toff;
+
+	swap_device_lock(swapdev);
+	do {
+		/* Don't read-ahead past the end of the swap area */
+		if (toff >= swapdev->max)
+			break;
+		/* Don't read in free or bad pages */
+		if (!swapdev->swap_map[toff])
+			break;
+		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
+			break;
+		toff++;
+		ret++;
+	} while (--i);
+	swap_device_unlock(swapdev);
+	return ret;
+}
diff --git a/mm/thrash.c b/mm/thrash.c
new file mode 100644
index 000000000000..11461f7ad830
--- /dev/null
+++ b/mm/thrash.c
@@ -0,0 +1,102 @@
+/*
+ * mm/thrash.c
+ *
+ * Copyright (C) 2004, Red Hat, Inc.
+ * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * Simple token based thrashing protection, using the algorithm
+ * described in:  http://www.cs.wm.edu/~sjiang/token.pdf
+ */
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+
+/* Taken around updates of swap_token_mm in this file. */
+static DEFINE_SPINLOCK(swap_token_lock);
+/* jiffies value after which the current holder counts as timed out. */
+static unsigned long swap_token_timeout;
+/* jiffies value after which grab_swap_token() may try again. */
+static unsigned long swap_token_check;
+/* The mm currently holding the token; &init_mm when nobody holds it. */
+struct mm_struct * swap_token_mm = &init_mm;
+
+/* Minimum spacing, in jiffies, between attempts to take the token. */
+#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
+#define SWAP_TOKEN_TIMEOUT	0
+/*
+ * Currently disabled; Needs further code to work at HZ * 300.
+ */
+unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
+
+/*
+ * Decide whether @mm has to give the token up.
+ *
+ * The token is released when @mm recorded no page-in since the previous
+ * check (SWAP_TOKEN_ENOUGH_RSS), or when jiffies has moved past
+ * swap_token_timeout (SWAP_TOKEN_TIMED_OUT).  Returns 0 when the token
+ * should stay.  The recent_pagein flag of @mm is cleared in every case.
+ */
+#define SWAP_TOKEN_ENOUGH_RSS 1
+#define SWAP_TOKEN_TIMED_OUT 2
+static int should_release_swap_token(struct mm_struct *mm)
+{
+	int reason;
+
+	if (!mm->recent_pagein)
+		reason = SWAP_TOKEN_ENOUGH_RSS;
+	else if (time_after(jiffies, swap_token_timeout))
+		reason = SWAP_TOKEN_TIMED_OUT;
+	else
+		reason = 0;
+
+	/* Start a fresh observation interval for this mm. */
+	mm->recent_pagein = 0;
+	return reason;
+}
+
+/*
+ * Try to grab the swapout protection token.  We only try to
+ * grab it once every SWAP_TOKEN_CHECK_INTERVAL, both to prevent
+ * SMP lock contention and to check that the process that held
+ * the token before is no longer thrashing.
+ *
+ * If current->mm already holds the token, only its recent_pagein flag is
+ * set.  Otherwise the token moves to current->mm when
+ * should_release_swap_token() says the present holder must give it up.
+ */
+void grab_swap_token(void)
+{
+	struct mm_struct *mm;
+	int reason;
+
+	/* We have the token. Let others know we still need it. */
+	if (has_swap_token(current->mm)) {
+		current->mm->recent_pagein = 1;
+		return;
+	}
+
+	if (time_after(jiffies, swap_token_check)) {
+
+		/* Can't get swapout protection if we exceed our RSS limit. */
+		// if (current->mm->rss > current->mm->rlimit_rss)
+		//	return;
+
+		/* ... or if we recently held the token. */
+		if (time_before(jiffies, current->mm->swap_token_time))
+			return;
+
+		/* Never spin here: give up if someone else holds the lock. */
+		if (!spin_trylock(&swap_token_lock))
+			return;
+
+		/* Push the next attempt out by one check interval. */
+		swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+
+		mm = swap_token_mm;
+		if ((reason = should_release_swap_token(mm))) {
+			/*
+			 * The old holder may retry immediately unless it timed
+			 * out, in which case it waits the default timeout.
+			 */
+			unsigned long eligible = jiffies;
+			if (reason == SWAP_TOKEN_TIMED_OUT) {
+				eligible += swap_token_default_timeout;
+			}
+			mm->swap_token_time = eligible;
+			swap_token_timeout = jiffies + swap_token_default_timeout;
+			swap_token_mm = current->mm;
+		}
+		spin_unlock(&swap_token_lock);
+	}
+	return;
+}
+
+/*
+ * Called on process exit.
+ *
+ * If @mm is the current token holder, hand the token back to init_mm and
+ * reset swap_token_check to the current jiffies value.
+ */
+void __put_swap_token(struct mm_struct *mm)
+{
+	int holds_token;
+
+	spin_lock(&swap_token_lock);
+	holds_token = (mm == swap_token_mm);
+	if (likely(holds_token)) {
+		swap_token_mm = &init_mm;
+		swap_token_check = jiffies;
+	}
+	spin_unlock(&swap_token_lock);
+}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
new file mode 100644
index 000000000000..c13a2161bca2
--- /dev/null
+++ b/mm/tiny-shmem.c
@@ -0,0 +1,122 @@
+/*
+ * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
+ *
+ * Matt Mackall <mpm@selenic.com> January, 2004
+ * derived from mm/shmem.c and fs/ramfs/inode.c
+ *
+ * This is intended for small systems where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/ramfs.h>
+
+/*
+ * Filesystem type registered under the name "tmpfs".  Superblock creation
+ * is delegated to ramfs_get_sb() and teardown to kill_litter_super().
+ */
+static struct file_system_type tmpfs_fs_type = {
+	.name		= "tmpfs",
+	.get_sb		= ramfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+/* Result of kern_mount() in init_tmpfs(); users check it with IS_ERR(). */
+static struct vfsmount *shm_mnt;
+
+/*
+ * Initialisation: register the tmpfs filesystem type and create the internal
+ * mount that shmem_file_setup() allocates its files on.
+ *
+ * NOTE(review): the return value of register_filesystem() is ignored, and a
+ * failed kern_mount() is only noticed later through the IS_ERR(shm_mnt)
+ * check in shmem_file_setup().  Always returns 0.
+ */
+static int __init init_tmpfs(void)
+{
+	register_filesystem(&tmpfs_fs_type);
+#ifdef CONFIG_TMPFS
+	devfs_mk_dir("shm");
+#endif
+	shm_mnt = kern_mount(&tmpfs_fs_type);
+	return 0;
+}
+module_init(init_tmpfs)
+
+/*
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ *
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: not used by this implementation
+ *
+ * Returns the new file, opened read/write, or an ERR_PTR(): the error stored
+ * in shm_mnt if the internal mount failed, -ENOMEM if no dentry could be
+ * allocated, -ENFILE if no struct file was available, or -ENOSPC if no inode
+ * could be obtained.
+ */
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+{
+	int error;
+	struct file *file;
+	struct inode *inode;
+	struct dentry *dentry, *root;
+	struct qstr this;
+
+	/* kern_mount() failed in init_tmpfs(): pass its error on */
+	if (IS_ERR(shm_mnt))
+		return (void *)shm_mnt;
+
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0; /* will go */
+	root = shm_mnt->mnt_root;
+	dentry = d_alloc(root, &this);
+	if (!dentry)
+		goto put_memory;
+
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto put_dentry;
+
+	error = -ENOSPC;
+	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
+		goto close_file;
+
+	/* Wire dentry, inode and file together; nothing below can fail */
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;	/* It is unlinked */
+	file->f_vfsmnt = mntget(shm_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_op = &ramfs_file_operations;
+	file->f_mode = FMODE_WRITE | FMODE_READ;
+	return file;
+
+	/* Unwind in reverse order of acquisition */
+close_file:
+	put_filp(file);
+put_dentry:
+	dput(dentry);
+put_memory:
+	return ERR_PTR(error);
+}
+
+/*
+ * shmem_zero_setup - setup a shared anonymous mapping
+ *
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ *
+ * Creates an unlinked "dev/zero" file as large as the vma, drops any file
+ * the vma referenced before, and installs the new file together with
+ * generic_file_vm_ops.  Returns 0, or the error from shmem_file_setup().
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	loff_t len = vma->vm_end - vma->vm_start;
+	struct file *zero_file;
+
+	zero_file = shmem_file_setup("dev/zero", len, vma->vm_flags);
+	if (IS_ERR(zero_file))
+		return PTR_ERR(zero_file);
+
+	/* Release the file the vma pointed at before, if there was one. */
+	if (vma->vm_file != NULL)
+		fput(vma->vm_file);
+
+	vma->vm_file = zero_file;
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/*
+ * Stub: both arguments are ignored and 0 is always returned.
+ */
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+	return 0;
+}
diff --git a/mm/truncate.c b/mm/truncate.c
new file mode 100644
index 000000000000..c9a63f0b69a2
--- /dev/null
+++ b/mm/truncate.c
@@ -0,0 +1,336 @@
+/*
+ * mm/truncate.c - code for taking down pages from address_spaces
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 10Sep2002	akpm@zip.com.au
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/buffer_head.h>	/* grr. try_to_release_page,
+				   block_invalidatepage */
+
+
+/*
+ * Call the ->invalidatepage() method of the page's mapping, falling back to
+ * block_invalidatepage() when the mapping does not provide one.
+ * Returns whatever the chosen method returns.
+ */
+static int do_invalidatepage(struct page *page, unsigned long offset)
+{
+	int (*op)(struct page *, unsigned long) =
+			page->mapping->a_ops->invalidatepage;
+
+	if (!op)
+		op = block_invalidatepage;
+	return op(page, offset);
+}
+
+/*
+ * Clear the part of @page from byte @partial to the end of the page and, if
+ * the page carries private data, invalidate it from the same offset.
+ */
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE - partial);
+
+	if (!PagePrivate(page))
+		return;
+	do_invalidatepage(page, partial);
+}
+
+/*
+ * If truncate cannot remove the fs-private metadata from the page, the page
+ * becomes anonymous.  It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_nopage().
+ *
+ * We need to bale out if page->mapping is no longer equal to the original
+ * mapping.  This happens a) when the VM reclaimed the page while we waited on
+ * its lock, b) when a concurrent invalidate_inode_pages got there first and
+ * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+ */
+static void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+	/* The page no longer belongs to this mapping: nothing to do */
+	if (page->mapping != mapping)
+		return;
+
+	/* Drop fs-private data for the whole page (offset 0) */
+	if (PagePrivate(page))
+		do_invalidatepage(page, 0);
+
+	/* Clear the state bits, then take the page out of the page cache */
+	clear_page_dirty(page);
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	remove_from_page_cache(page);
+	page_cache_release(page);	/* pagecache ref */
+}
+
+/*
+ * This is for invalidate_inode_pages().  That function can be called at
+ * any time, and is not supposed to throw away dirty pages.  But pages can
+ * be marked dirty at any time too.  So we re-check the dirtiness inside
+ * ->tree_lock.  That provides exclusion against the __set_page_dirty
+ * functions.
+ *
+ * Returns non-zero if the page was successfully invalidated.  Returns 0 if
+ * the page has left @mapping, if its private data could not be released, or
+ * if it is found dirty under ->tree_lock.
+ */
+static int
+invalidate_complete_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return 0;
+
+	/* Private data must go first; give up if it cannot be released */
+	if (PagePrivate(page) && !try_to_release_page(page, 0))
+		return 0;
+
+	write_lock_irq(&mapping->tree_lock);
+	/* Re-check dirtiness now that the tree lock is held */
+	if (PageDirty(page)) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+
+	BUG_ON(PagePrivate(page));
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	ClearPageUptodate(page);
+	page_cache_release(page);	/* pagecache ref */
+	return 1;
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Truncate the page cache at a set offset, removing the pages that are beyond
+ * that offset (and zeroing out partial pages).
+ *
+ * Truncate takes two passes - the first pass is nonblocking.  It will not
+ * block on page locks and it will not block on writeback.  The second pass
+ * will wait.  This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * When looking at page->index outside the page lock we need to be careful to
+ * copy it into a local to avoid races (it could change at any time).
+ *
+ * We pass down the cache-hot hint to the page freeing code.  Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	/* Index of the first page that lies entirely at or beyond lstart */
+	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	/* Offset of lstart within its page; non-zero means a partial page */
+	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	/*
+	 * Pass 1: non-blocking.  Pages that are locked by someone else or
+	 * under writeback are skipped and left for the second pass.
+	 */
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			/* Sampled once: the page is not locked yet */
+			pgoff_t page_index = page->index;
+
+			/* Resume the next lookup just after this page */
+			if (page_index > next)
+				next = page_index;
+			next++;
+			if (TestSetPageLocked(page))
+				continue;
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				continue;
+			}
+			truncate_complete_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/* Zero the tail of the page that straddles lstart, if there is one */
+	if (partial) {
+		struct page *page = find_lock_page(mapping, start - 1);
+		if (page) {
+			wait_on_page_writeback(page);
+			truncate_partial_page(page, partial);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+
+	/*
+	 * Pass 2: blocking.  Keep rescanning from start until a lookup that
+	 * begins at start finds no pages at all.
+	 */
+	next = start;
+	for ( ; ; ) {
+		cond_resched();
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+			if (next == start)
+				break;
+			/* Reached the end of a sweep: start over */
+			next = start;
+			continue;
+		}
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			wait_on_page_writeback(page);
+			if (page->index > next)
+				next = page->index;
+			next++;
+			truncate_complete_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+	}
+}
+
+EXPORT_SYMBOL(truncate_inode_pages);
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+/* Returns the number of pages that were actually invalidated. */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t next = start;
+	unsigned long ret = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (next <= end &&
+			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			/* Locked by someone else: skip it without waiting */
+			if (TestSetPageLocked(page)) {
+				next++;
+				continue;
+			}
+			/* Resume the next lookup just after this page */
+			if (page->index > next)
+				next = page->index;
+			next++;
+			/* Dirty, writeback and mapped pages are left alone */
+			if (PageDirty(page) || PageWriteback(page))
+				goto unlock;
+			if (page_mapped(page))
+				goto unlock;
+			ret += invalidate_complete_page(mapping, page);
+unlock:
+			unlock_page(page);
+			if (next > end)
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+
+/*
+ * Invalidate the unlocked pages of @mapping over the whole index range.
+ * Returns the count reported by invalidate_mapping_pages().
+ */
+unsigned long invalidate_inode_pages(struct address_space *mapping)
+{
+	const pgoff_t first = 0;
+	const pgoff_t last = ~0UL;
+
+	return invalidate_mapping_pages(mapping, first, last);
+}
+
+EXPORT_SYMBOL(invalidate_inode_pages);
+
+/**
+ * invalidate_inode_pages2_range - remove range of pages from an address_space
+ * @mapping: the address_space
+ * @start: the page offset 'from' which to invalidate
+ * @end: the page offset 'to' which to invalidate (inclusive)
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EIO if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2_range(struct address_space *mapping,
+				  pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+	int ret = 0;
+	int did_range_unmap = 0;
+	int wrapped = 0;
+
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (next <= end && !ret && !wrapped &&
+		pagevec_lookup(&pvec, mapping, next,
+			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index;
+			int was_dirty;
+
+			lock_page(page);
+			/* The page left this mapping while we waited */
+			if (page->mapping != mapping) {
+				unlock_page(page);
+				continue;
+			}
+			page_index = page->index;
+			next = page_index + 1;
+			/* page_index was the last possible index: stop after it */
+			if (next == 0)
+				wrapped = 1;
+			if (page_index > end) {
+				unlock_page(page);
+				break;
+			}
+			wait_on_page_writeback(page);
+			while (page_mapped(page)) {
+				/*
+				 * The byte offsets are computed in loff_t: a
+				 * pgoff_t shift would overflow on 32-bit for
+				 * pages at or beyond 4GB.
+				 */
+				if (!did_range_unmap) {
+					/*
+					 * Zap the rest of the file in one hit.
+					 */
+					unmap_mapping_range(mapping,
+					    (loff_t)page_index << PAGE_CACHE_SHIFT,
+					    (loff_t)(end - page_index + 1)
+							<< PAGE_CACHE_SHIFT,
+					    0);
+					did_range_unmap = 1;
+				} else {
+					/*
+					 * Just zap this page
+					 */
+					unmap_mapping_range(mapping,
+					  (loff_t)page_index << PAGE_CACHE_SHIFT,
+					  PAGE_CACHE_SIZE, 0);
+				}
+			}
+			/* Remember dirtiness so it can be restored on failure */
+			was_dirty = test_clear_page_dirty(page);
+			if (!invalidate_complete_page(mapping, page)) {
+				if (was_dirty)
+					set_page_dirty(page);
+				ret = -EIO;
+			}
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
+
+/**
+ * invalidate_inode_pages2 - remove all pages from an address_space
+ * @mapping: the address_space
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EIO if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2(struct address_space *mapping)
+{
+	/* Cover every index, from 0 up to the largest pgoff_t value. */
+	const pgoff_t last = (pgoff_t)-1;
+
+	return invalidate_inode_pages2_range(mapping, 0, last);
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
new file mode 100644
index 000000000000..c6182f6f1305
--- /dev/null
+++ b/mm/vmalloc.c
@@ -0,0 +1,588 @@
+/*
+ *  linux/mm/vmalloc.c
+ *
+ *  Copyright (C) 1993  Linus Torvalds
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
+ *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+
+
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+/*
+ * Clear the kernel PTEs under @pmd that cover [addr, end), one page at a
+ * time.  Warns if a cleared entry was neither empty nor present.
+ * The loop ends on addr == end, so it assumes end is page-aligned relative
+ * to addr and above it -- TODO confirm against callers.
+ */
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/*
+ * Walk the pmd entries under @pud that cover [addr, end) and clear the PTEs
+ * below each one.  Entries for which pmd_none_or_clear_bad() returns true
+ * are skipped.
+ */
+static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
+						unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		/* next = end of the span handled through this pmd entry */
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		vunmap_pte_range(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+/*
+ * Walk the pud entries under @pgd that cover [addr, end) and unmap
+ * everything below each one.  Entries for which pud_none_or_clear_bad()
+ * returns true are skipped.
+ */
+static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
+						unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		/* next = end of the span handled through this pud entry */
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		vunmap_pmd_range(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+/*
+ * Remove the kernel page-table entries for the whole of @area, that is
+ * [area->addr, area->addr + area->size).  The cache is flushed before the
+ * walk and the kernel TLB range afterwards.  BUGs if the range is empty or
+ * wraps around.
+ */
+void unmap_vm_area(struct vm_struct *area)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long addr = (unsigned long) area->addr;
+	unsigned long end = addr + area->size;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	flush_cache_vunmap(addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		vunmap_pud_range(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+	/* addr has been advanced to end; flush from the start of the area */
+	flush_tlb_kernel_range((unsigned long) area->addr, end);
+}
+
+/*
+ * Install PTEs for [addr, end) under @pmd, taking one page per PTE from the
+ * array cursor *@pages and advancing the cursor after each page mapped.
+ *
+ * Returns 0 on success, or -ENOMEM if pte_alloc_kernel() fails or a NULL
+ * page pointer is found in the array.  Warns if a PTE was already in use.
+ */
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pte_t *pte;
+
+	pte = pte_alloc_kernel(&init_mm, pmd, addr);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		struct page *page = **pages;
+		WARN_ON(!pte_none(*pte));
+		if (!page)
+			return -ENOMEM;
+		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
+		(*pages)++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	return 0;
+}
+
+/*
+ * Map [addr, end) under @pud, one pmd entry at a time.
+ * Returns 0 on success, or -ENOMEM if pmd_alloc() or a lower level fails.
+ */
+static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc(&init_mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (vmap_pte_range(pmd, addr, next, prot, pages))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Map [addr, end) under @pgd, one pud entry at a time.
+ * Returns 0 on success, or -ENOMEM if pud_alloc() or a lower level fails.
+ */
+static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc(&init_mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+		if (vmap_pmd_range(pud, addr, next, prot, pages))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Map the pages referenced through *@pages into @area with protection @prot.
+ *
+ * The last PAGE_SIZE bytes of the area (the guard page added by
+ * __get_vm_area()) are left unmapped.  The walk runs under
+ * init_mm.page_table_lock, and *@pages is advanced past every page that was
+ * mapped.  Returns 0 on success or the error from the lower levels; nothing
+ * is unmapped here on failure.
+ */
+int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long addr = (unsigned long) area->addr;
+	/* Stop one page short of the area: the guard page stays unmapped */
+	unsigned long end = addr + area->size - PAGE_SIZE;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	spin_lock(&init_mm.page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_pud_range(pgd, addr, next, prot, pages);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&init_mm.page_table_lock);
+	flush_cache_vmap((unsigned long) area->addr, end);
+	return err;
+}
+
+#define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
+
+/*
+ * Reserve a virtual address range of @size bytes (rounded up to whole pages,
+ * plus one guard page) inside [@start, @end) and link a descriptor for it
+ * into vmlist, which is kept sorted by address.
+ *
+ * With VM_IOREMAP in @flags the start address is aligned to a power of two
+ * derived from @size, clamped between PAGE_SHIFT and IOREMAP_MAX_ORDER bits.
+ *
+ * Returns the new descriptor, or NULL if @size is zero, the descriptor
+ * cannot be allocated, or no hole of sufficient size exists below @end.
+ */
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+				unsigned long start, unsigned long end)
+{
+	struct vm_struct **p, *tmp, *area;
+	unsigned long align = 1;
+	unsigned long addr;
+
+	if (flags & VM_IOREMAP) {
+		int bit = fls(size);
+
+		if (bit > IOREMAP_MAX_ORDER)
+			bit = IOREMAP_MAX_ORDER;
+		else if (bit < PAGE_SHIFT)
+			bit = PAGE_SHIFT;
+
+		align = 1ul << bit;
+	}
+	addr = ALIGN(start, align);
+	size = PAGE_ALIGN(size);
+	/* Reject empty requests before allocating anything */
+	if (unlikely(!size))
+		return NULL;
+
+	area = kmalloc(sizeof(*area), GFP_KERNEL);
+	if (unlikely(!area))
+		return NULL;
+
+	/*
+	 * We always allocate a guard page.
+	 */
+	size += PAGE_SIZE;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
+		/* Areas below the candidate can only push it upwards */
+		if ((unsigned long)tmp->addr < addr) {
+			if((unsigned long)tmp->addr + tmp->size >= addr)
+				addr = ALIGN(tmp->size +
+					     (unsigned long)tmp->addr, align);
+			continue;
+		}
+		if ((size + addr) < addr)
+			goto out;
+		/* The hole in front of tmp is large enough */
+		if (size + addr <= (unsigned long)tmp->addr)
+			goto found;
+		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
+		if (addr > end - size)
+			goto out;
+	}
+
+	/*
+	 * Ran off the end of the list: the candidate sits above every
+	 * existing area, so it still has to be checked against wrap-around
+	 * and against the upper limit of the range.
+	 */
+	if ((size + addr) < addr)
+		goto out;
+	if (addr > end - size)
+		goto out;
+
+found:
+	area->next = *p;
+	*p = area;
+
+	area->flags = flags;
+	area->addr = (void *)addr;
+	area->size = size;
+	area->pages = NULL;
+	area->nr_pages = 0;
+	area->phys_addr = 0;
+	write_unlock(&vmlist_lock);
+
+	return area;
+
+out:
+	write_unlock(&vmlist_lock);
+	kfree(area);
+	if (printk_ratelimit())
+		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
+	return NULL;
+}
+
+/**
+ *	get_vm_area  -  reserve a contiguous kernel virtual area
+ *
+ *	@size:		size of the area
+ *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ *	Search for an area of @size in the kernel virtual mapping area
+ *	and reserve it for our purposes.  Returns the area descriptor
+ *	on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+	const unsigned long lo = VMALLOC_START;
+	const unsigned long hi = VMALLOC_END;
+
+	return __get_vm_area(size, flags, lo, hi);
+}
+
+/**
+ *	remove_vm_area  -  find and remove a contiguous kernel virtual area
+ *
+ *	@addr:		base address
+ *
+ *	Search for the kernel VM area starting at @addr, and remove it.
+ *	This function returns the found VM area, but using it is NOT safe
+ *	on SMP machines.
+ *
+ *	The area is unmapped and unlinked under vmlist_lock.  The size in
+ *	the returned descriptor no longer includes the guard page.  Returns
+ *	%NULL if no area starts at @addr.
+ */
+struct vm_struct *remove_vm_area(void *addr)
+{
+	struct vm_struct **p, *tmp;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
+		 if (tmp->addr == addr)
+			 goto found;
+	}
+	write_unlock(&vmlist_lock);
+	return NULL;
+
+found:
+	/* Unmap while size still includes the guard page, then unlink */
+	unmap_vm_area(tmp);
+	*p = tmp->next;
+	write_unlock(&vmlist_lock);
+
+	/*
+	 * Remove the guard page.
+	 */
+	tmp->size -= PAGE_SIZE;
+	return tmp;
+}
+
+/*
+ * Common backend of vfree() and vunmap(): remove the area starting at @addr
+ * and free its descriptor.  With @deallocate_pages non-zero, the pages in
+ * area->pages and the array itself are freed too.
+ *
+ * A NULL @addr is ignored.  An @addr that is not page-aligned, or that does
+ * not start a known area, is reported and nothing is freed.
+ */
+void __vunmap(void *addr, int deallocate_pages)
+{
+	struct vm_struct *area;
+
+	if (!addr)
+		return;
+
+	if ((PAGE_SIZE-1) & (unsigned long)addr) {
+		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+		WARN_ON(1);
+		return;
+	}
+
+	area = remove_vm_area(addr);
+	if (unlikely(!area)) {
+		printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+				addr);
+		WARN_ON(1);
+		return;
+	}
+
+	if (deallocate_pages) {
+		int i;
+
+		for (i = 0; i < area->nr_pages; i++) {
+			if (unlikely(!area->pages[i]))
+				BUG();
+			__free_page(area->pages[i]);
+		}
+
+		/*
+		 * The array is released with vfree() or kfree() depending on
+		 * nr_pages.  NOTE(review): this is inferred from the page
+		 * count rather than recorded at allocation time -- confirm
+		 * that it matches the allocator used whenever nr_pages was
+		 * changed after the array was allocated.
+		 */
+		if (area->nr_pages > PAGE_SIZE/sizeof(struct page *))
+			vfree(area->pages);
+		else
+			kfree(area->pages);
+	}
+
+	kfree(area);
+	return;
+}
+
+/**
+ *	vfree  -  release memory allocated by vmalloc()
+ *
+ *	@addr:		memory base address
+ *
+ *	Free the virtually contiguous memory area starting at @addr, as
+ *	obtained from vmalloc(), vmalloc_32() or __vmalloc().  The backing
+ *	pages are freed as well.
+ *
+ *	May not be called in interrupt context.
+ */
+void vfree(void *addr)
+{
+	const int free_backing_pages = 1;
+
+	BUG_ON(in_interrupt());
+	__vunmap(addr, free_backing_pages);
+}
+
+EXPORT_SYMBOL(vfree);
+
+/**
+ *	vunmap  -  release virtual mapping obtained by vmap()
+ *
+ *	@addr:		memory base address
+ *
+ *	Free the virtually contiguous memory area starting at @addr,
+ *	which was created from the page array passed to vmap().  The
+ *	pages themselves are not freed.
+ *
+ *	May not be called in interrupt context.
+ */
+void vunmap(void *addr)
+{
+	const int free_backing_pages = 0;
+
+	BUG_ON(in_interrupt());
+	__vunmap(addr, free_backing_pages);
+}
+
+EXPORT_SYMBOL(vunmap);
+
+/**
+ *	vmap  -  map an array of pages into virtually contiguous space
+ *
+ *	@pages:		array of page pointers
+ *	@count:		number of pages to map
+ *	@flags:		vm_area->flags
+ *	@prot:		page protection for the mapping
+ *
+ *	Maps @count pages from @pages into contiguous kernel virtual
+ *	space.  Returns the start address of the mapping, or %NULL if
+ *	@count is out of range, no virtual area could be reserved, or
+ *	the pages could not be mapped.
+ */
+void *vmap(struct page **pages, unsigned int count,
+		unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	unsigned long size;
+
+	if (count > num_physpages)
+		return NULL;
+	/* The byte size must be representable in an unsigned long */
+	if (count > (~0UL >> PAGE_SHIFT))
+		return NULL;
+
+	/*
+	 * Widen before shifting: count is only an unsigned int, so the
+	 * shift could otherwise wrap and reserve too small an area.
+	 */
+	size = (unsigned long)count << PAGE_SHIFT;
+
+	area = get_vm_area(size, flags);
+	if (!area)
+		return NULL;
+	if (map_vm_area(area, prot, &pages)) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	return area->addr;
+}
+
+EXPORT_SYMBOL(vmap);
+
+/*
+ * Populate @area with freshly allocated pages and map them with @prot.
+ *
+ * The page-pointer array comes from __vmalloc() when it is larger than one
+ * page and from kmalloc() otherwise.  Returns the start address of the area
+ * on success.  On any failure everything allocated here is released, @area
+ * itself is removed and freed, and NULL is returned.
+ */
+void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
+{
+	struct page **pages;
+	unsigned int nr_pages, array_size, i;
+
+	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+	array_size = (nr_pages * sizeof(struct page *));
+
+	area->nr_pages = nr_pages;
+	/* Please note that the recursion is strictly bounded. */
+	if (array_size > PAGE_SIZE)
+		pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+	else
+		pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+	area->pages = pages;
+	if (!area->pages) {
+		remove_vm_area(area->addr);
+		kfree(area);
+		return NULL;
+	}
+	memset(area->pages, 0, array_size);
+
+	for (i = 0; i < area->nr_pages; i++) {
+		area->pages[i] = alloc_page(gfp_mask);
+		if (unlikely(!area->pages[i]))
+			goto fail;
+	}
+
+	/* map_vm_area() advances the local cursor; area->pages is unchanged */
+	if (map_vm_area(area, prot, &pages))
+		goto fail;
+	return area->addr;
+
+fail:
+	/*
+	 * Undo by hand instead of going through vfree(): __vunmap() picks
+	 * vfree() or kfree() for the array from area->nr_pages, which is the
+	 * wrong test once fewer pages than planned have been allocated.
+	 * Here the allocator that was used is known from array_size.
+	 *
+	 * i is the number of pages successfully allocated so far.
+	 */
+	remove_vm_area(area->addr);
+	while (i--)
+		__free_page(area->pages[i]);
+	if (array_size > PAGE_SIZE)
+		vfree(area->pages);
+	else
+		kfree(area->pages);
+	kfree(area);
+	return NULL;
+}
+
+/**
+ *	__vmalloc  -  allocate virtually contiguous memory
+ *
+ *	@size:		allocation size
+ *	@gfp_mask:	flags for the page level allocator
+ *	@prot:		protection mask for the allocated pages
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator with @gfp_mask flags.  Map them into contiguous
+ *	kernel virtual space, using a pagetable protection of @prot.
+ *
+ *	Returns %NULL if @size is zero, asks for more pages than
+ *	num_physpages, or the allocation fails.
+ */
+void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
+{
+	struct vm_struct *area;
+
+	size = PAGE_ALIGN(size);
+	if (size == 0)
+		return NULL;
+	if ((size >> PAGE_SHIFT) > num_physpages)
+		return NULL;
+
+	area = get_vm_area(size, VM_ALLOC);
+	if (area == NULL)
+		return NULL;
+
+	return __vmalloc_area(area, gfp_mask, prot);
+}
+
+EXPORT_SYMBOL(__vmalloc);
+
+/**
+ *	vmalloc  -  allocate virtually contiguous memory
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator and map them into contiguous kernel virtual space.
+ *
+ *	For tight control over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+	const unsigned int gfp = GFP_KERNEL | __GFP_HIGHMEM;
+
+	return __vmalloc(size, gfp, PAGE_KERNEL);
+}
+
+EXPORT_SYMBOL(vmalloc);
+
+/**
+ *	vmalloc_exec  -  allocate virtually contiguous, executable memory
+ *
+ *	@size:		allocation size
+ *
+ *	Kernel-internal function to allocate enough pages to cover @size
+ *	from the page level allocator and map them into contiguous and
+ *	executable kernel virtual space.
+ *
+ *	For tight control over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+void *vmalloc_exec(unsigned long size)
+{
+	const unsigned int gfp = GFP_KERNEL | __GFP_HIGHMEM;
+
+	return __vmalloc(size, gfp, PAGE_KERNEL_EXEC);
+}
+
+/**
+ *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough 32bit PA addressable pages to cover @size from the
+ *	page level allocator and map them into contiguous kernel virtual space.
+ *	Unlike vmalloc(), the request is made without __GFP_HIGHMEM.
+ */
+void *vmalloc_32(unsigned long size)
+{
+	const unsigned int gfp = GFP_KERNEL;
+
+	return __vmalloc(size, gfp, PAGE_KERNEL);
+}
+
+EXPORT_SYMBOL(vmalloc_32);
+
+/*
+ * Copy up to @count bytes starting at kernel virtual address @addr into
+ * @buf, walking the areas on vmlist in list order under the read lock.
+ *
+ * Bytes that lie before the start of the next area are written to @buf as
+ * zeroes.  The last PAGE_SIZE bytes of each area (the guard page) are never
+ * read.  Returns the number of bytes stored in @buf, which is less than
+ * @count when the list runs out first.
+ */
+long vread(char *buf, char *addr, unsigned long count)
+{
+	struct vm_struct *tmp;
+	char *vaddr, *buf_start = buf;
+	unsigned long n;
+
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+
+	read_lock(&vmlist_lock);
+	for (tmp = vmlist; tmp; tmp = tmp->next) {
+		vaddr = (char *) tmp->addr;
+		/* addr is already past the usable part of this area */
+		if (addr >= vaddr + tmp->size - PAGE_SIZE)
+			continue;
+		/* Zero-fill the gap in front of this area */
+		while (addr < vaddr) {
+			if (count == 0)
+				goto finished;
+			*buf = '\0';
+			buf++;
+			addr++;
+			count--;
+		}
+		/* n = bytes left in this area, guard page excluded */
+		n = vaddr + tmp->size - PAGE_SIZE - addr;
+		do {
+			if (count == 0)
+				goto finished;
+			*buf = *addr;
+			buf++;
+			addr++;
+			count--;
+		} while (--n > 0);
+	}
+finished:
+	read_unlock(&vmlist_lock);
+	return buf - buf_start;
+}
+
+/*
+ * Copy up to @count bytes from @buf to kernel virtual address @addr, walking
+ * the areas on vmlist in list order under the read lock.
+ *
+ * Bytes of @buf that correspond to addresses before the start of the next
+ * area are skipped without being written anywhere.  The last PAGE_SIZE bytes
+ * of each area (the guard page) are never written.  Returns the number of
+ * bytes of @buf consumed, skipped bytes included.
+ */
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+	struct vm_struct *tmp;
+	char *vaddr, *buf_start = buf;
+	unsigned long n;
+
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+
+	read_lock(&vmlist_lock);
+	for (tmp = vmlist; tmp; tmp = tmp->next) {
+		vaddr = (char *) tmp->addr;
+		/* addr is already past the usable part of this area */
+		if (addr >= vaddr + tmp->size - PAGE_SIZE)
+			continue;
+		/* Skip source bytes that fall into the gap before this area */
+		while (addr < vaddr) {
+			if (count == 0)
+				goto finished;
+			buf++;
+			addr++;
+			count--;
+		}
+		/* n = bytes left in this area, guard page excluded */
+		n = vaddr + tmp->size - PAGE_SIZE - addr;
+		do {
+			if (count == 0)
+				goto finished;
+			*addr = *buf;
+			buf++;
+			addr++;
+			count--;
+		} while (--n > 0);
+	}
+finished:
+	read_unlock(&vmlist_lock);
+	return buf - buf_start;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
new file mode 100644
index 000000000000..4003c0518d28
--- /dev/null
+++ b/mm/vmscan.c
@@ -0,0 +1,1311 @@
+/*
+ *  linux/mm/vmscan.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Swap reorganised 29.12.95, Stephen Tweedie.
+ *  kswapd added: 7.1.96  sct
+ *  Removed kswapd_ctl limits, and swap out as many pages as needed
+ *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
+ *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ *  Multiqueue VM started 5.8.00, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/notifier.h>
+#include <linux/rwsem.h>
+
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+
+#include <linux/swapops.h>
+
+/*
+ * Possible outcomes of pageout().  Each value also tells the caller
+ * (shrink_list()) whether the page is still locked on return.
+ */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
+/*
+ * Per-invocation reclaim state, filled in by try_to_free_pages() and
+ * balance_pgdat() and passed down through shrink_zone(), shrink_cache(),
+ * refill_inactive_zone() and shrink_list().
+ */
+struct scan_control {
+	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
+	unsigned long nr_to_scan;
+
+	/* Incremented by the number of inactive pages that were scanned */
+	unsigned long nr_scanned;
+
+	/* Incremented by the number of pages reclaimed */
+	unsigned long nr_reclaimed;
+
+	unsigned long nr_mapped;	/* From page_state */
+
+	/* How many pages shrink_cache() should reclaim */
+	int nr_to_reclaim;
+
+	/* Ask shrink_caches, or shrink_zone to scan at this priority */
+	unsigned int priority;
+
+	/* This context's GFP mask */
+	unsigned int gfp_mask;
+
+	/*
+	 * When zero and laptop_mode is set, shrink_list() leaves dirty pages
+	 * alone instead of calling pageout() on them.
+	 */
+	int may_writepage;
+
+	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
+	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+	 * In this context, it doesn't matter that we scan the
+	 * whole list at once. */
+	int swap_cluster_max;
+};
+
+/*
+ * The list of shrinker callbacks used to apply pressure to
+ * ageable caches.
+ */
+struct shrinker {
+	shrinker_t		shrinker;	/* callback invoked by shrink_slab() */
+	struct list_head	list;		/* link on shrinker_list */
+	int			seeks;	/* seeks to recreate an obj */
+	long			nr;	/* objs pending delete */
+};
+
+/* The struct page whose ->lru entry is the one just before @_head (i.e. the list tail when @_head is a list head). */
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+#ifdef ARCH_HAS_PREFETCH
+/*
+ * Prefetch (for reading) member @_field of the page that precedes @_page on
+ * its LRU list, unless @_page is the last entry before the list head @_base.
+ * Expands to nothing when the architecture does not define ARCH_HAS_PREFETCH.
+ *
+ * Macro arguments are fully parenthesized so that expressions such as
+ * `*pp' or `cond ? a : b' may be passed safely.
+ */
+#define prefetch_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != (_base)) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&((_page)->lru));		\
+			prefetch(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+/*
+ * Prefetch (for writing) member @_field of the page that precedes @_page on
+ * its LRU list, unless @_page is the last entry before the list head @_base.
+ * Expands to nothing when the architecture does not define
+ * ARCH_HAS_PREFETCHW.
+ *
+ * Macro arguments are fully parenthesized so that expressions such as
+ * `*pp' or `cond ? a : b' may be passed safely.
+ */
+#define prefetchw_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != (_base)) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&((_page)->lru));		\
+			prefetchw(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * From 0 .. 100.  Higher means more swappy.
+ * Added into swap_tendency by refill_inactive_zone().
+ */
+int vm_swappiness = 60;
+/*
+ * Divisor used by refill_inactive_zone() to turn sc->nr_mapped into a
+ * percentage.  NOTE(review): it is not assigned anywhere in this part of
+ * the file - confirm where it is initialised.
+ */
+static long total_memory;
+
+/* Registered shrinkers; the list is modified and walked under shrinker_rwsem. */
+static LIST_HEAD(shrinker_list);
+static DECLARE_RWSEM(shrinker_rwsem);
+
+/*
+ * Register a shrinker callback with the vm.
+ *
+ * @seeks:	stored in the new shrinker; shrink_slab() divides by it
+ * @theshrinker: the callback to invoke
+ *
+ * Allocates a struct shrinker with GFP_KERNEL, appends it to shrinker_list
+ * under shrinker_rwsem and returns it, or returns NULL if the allocation
+ * failed.  The result is released with remove_shrinker().
+ */
+struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+{
+	struct shrinker *new_shrinker;
+
+	new_shrinker = kmalloc(sizeof(*new_shrinker), GFP_KERNEL);
+	if (!new_shrinker)
+		return NULL;
+
+	/* Fully initialise the entry before it becomes visible on the list */
+	new_shrinker->shrinker = theshrinker;
+	new_shrinker->seeks = seeks;
+	new_shrinker->nr = 0;
+
+	down_write(&shrinker_rwsem);
+	list_add_tail(&new_shrinker->list, &shrinker_list);
+	up_write(&shrinker_rwsem);
+
+	return new_shrinker;
+}
+EXPORT_SYMBOL(set_shrinker);
+
+/*
+ * Unregister a shrinker obtained from set_shrinker(): unlink it from
+ * shrinker_list under shrinker_rwsem, then free it.  The pointer must not
+ * be used by the caller afterwards.
+ */
+void remove_shrinker(struct shrinker *shrinker)
+{
+	down_write(&shrinker_rwsem);
+	list_del(&shrinker->list);
+	up_write(&shrinker_rwsem);
+	/* Freed only after it is off the list */
+	kfree(shrinker);
+}
+EXPORT_SYMBOL(remove_shrinker);
+
+#define SHRINK_BATCH 128
+/*
+ * Call the shrink functions to age shrinkable caches
+ *
+ * Here we assume it costs one seek to replace a lru page and that it also
+ * takes a seek to recreate a cache object.  With this in mind we age equal
+ * percentages of the lru and ageable caches.  This should balance the seeks
+ * generated by these structures.
+ *
+ * If the vm encountered mapped pages on the LRU it increases the pressure on
+ * slab to avoid swapping.
+ *
+ * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ *
+ * `lru_pages' represents the number of on-LRU pages in all the zones which
+ * are eligible for the caller's allocation attempt.  It is used for balancing
+ * slab reclaim versus page reclaim.
+ */
+static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
+			unsigned long lru_pages)
+{
+	struct shrinker *shrinker;
+
+	/* Always apply at least SWAP_CLUSTER_MAX worth of pressure */
+	if (scanned == 0)
+		scanned = SWAP_CLUSTER_MAX;
+
+	/* Never sleep on the semaphore: if it is contended, skip this round */
+	if (!down_read_trylock(&shrinker_rwsem))
+		return 0;
+
+	list_for_each_entry(shrinker, &shrinker_list, list) {
+		unsigned long long delta;
+		unsigned long total_scan;
+
+		/*
+		 * delta = (4 * scanned / seeks) * callback(0, gfp_mask)
+		 *				 / (lru_pages + 1)
+		 * NOTE(review): the callback's return value for a zero scan
+		 * count is presumably the cache's object count - confirm
+		 * against the shrinker_t implementations.
+		 */
+		delta = (4 * scanned) / shrinker->seeks;
+		delta *= (*shrinker->shrinker)(0, gfp_mask);
+		do_div(delta, lru_pages + 1);
+		shrinker->nr += delta;
+		if (shrinker->nr < 0)
+			shrinker->nr = LONG_MAX;	/* It wrapped! */
+
+		/* Take over the pending count; the remainder is put back below */
+		total_scan = shrinker->nr;
+		shrinker->nr = 0;
+
+		/* Call the shrinker in SHRINK_BATCH sized chunks */
+		while (total_scan >= SHRINK_BATCH) {
+			long this_scan = SHRINK_BATCH;
+			int shrink_ret;
+
+			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+			/* -1 from the callback stops the scan of this cache */
+			if (shrink_ret == -1)
+				break;
+			mod_page_state(slabs_scanned, this_scan);
+			total_scan -= this_scan;
+
+			cond_resched();
+		}
+
+		/* Carry whatever was not scanned over to the next call */
+		shrinker->nr += total_scan;
+	}
+	up_read(&shrinker_rwsem);
+	/* The return value is always 0 */
+	return 0;
+}
+
+/*
+ * Decide whether a page should be treated as "in use" through its mapping.
+ * Returns non-zero when the page is in somebody's page tables, when it is
+ * swapcache, or when mapping_mapped() reports its mapping as mapped.
+ * Returns 0 for a page without a mapping.
+ *
+ * Called without lock on whether page is mapped, so answer is unstable.
+ */
+static inline int page_mapping_inuse(struct page *page)
+{
+	struct address_space *mapping;
+
+	/*
+	 * Either the page is in somebody's page tables, or it is swapcache
+	 * (be more reluctant to reclaim swapcache than pagecache).
+	 */
+	if (page_mapped(page) || PageSwapCache(page))
+		return 1;
+
+	/* Otherwise: is the file mmap'd by somebody? */
+	mapping = page_mapping(page);
+	return mapping ? mapping_mapped(mapping) : 0;
+}
+
+/*
+ * Returns non-zero when, after discounting one reference if PagePrivate is
+ * set, the page has exactly two references left.
+ */
+static inline int is_page_cache_freeable(struct page *page)
+{
+	int private_ref = !!PagePrivate(page);
+
+	return page_count(page) - private_ref == 2;
+}
+
+/*
+ * May the current task start writeback against @bdi?  Returns 1 when the
+ * caller is kswapd or pdflush (unlikely, but why not...), when the queue is
+ * not write-congested, or when @bdi is the task's own
+ * current->backing_dev_info; returns 0 otherwise.
+ */
+static int may_write_to_queue(struct backing_dev_info *bdi)
+{
+	return current_is_kswapd() ||
+	       current_is_pdflush() ||
+	       !bdi_write_congested(bdi) ||
+	       bdi == current->backing_dev_info;
+}
+
+/*
+ * We detected a synchronous write error writing a page out.  Probably
+ * -ENOSPC.  We need to propagate that into the address_space for a subsequent
+ * fsync(), msync() or close().
+ *
+ * The tricky part is that after writepage we cannot touch the mapping: nothing
+ * prevents it from being freed up.  But we have a ref on the page and once
+ * that page is locked, the mapping is pinned.
+ *
+ * We're allowed to run sleeping lock_page() here because we know the caller has
+ * __GFP_FS.
+ */
+static void handle_write_error(struct address_space *mapping,
+				struct page *page, int error)
+{
+	lock_page(page);
+	/* Only touch the mapping if the locked page still belongs to it */
+	if (page_mapping(page) == mapping) {
+		/* -ENOSPC gets its own flag; everything else is recorded as EIO */
+		int flag = (error == -ENOSPC) ? AS_ENOSPC : AS_EIO;
+
+		set_bit(flag, &mapping->flags);
+	}
+	unlock_page(page);
+}
+
+/*
+ * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ */
+static pageout_t pageout(struct page *page, struct address_space *mapping)
+{
+	/*
+	 * If the page is dirty, only perform writeback if that write
+	 * will be non-blocking.  To prevent this allocation from being
+	 * stalled by pagecache activity.  But note that there may be
+	 * stalls if we need to run get_block().  We could test
+	 * PagePrivate for that.
+	 *
+	 * If this process is currently in generic_file_write() against
+	 * this page's queue, we can perform writeback even if that
+	 * will block.
+	 *
+	 * If the page is swapcache, write it back even if that would
+	 * block, for some throttling. This happens by accident, because
+	 * swap_backing_dev_info is bust: it doesn't reflect the
+	 * congestion state of the swapdevs.  Easy to fix, if needed.
+	 * See swapfile.c:page_queue_congested().
+	 */
+	/* Somebody else holds a reference: leave the page alone */
+	if (!is_page_cache_freeable(page))
+		return PAGE_KEEP;
+	if (!mapping) {
+		/*
+		 * Some data journaling orphaned pages can have
+		 * page->mapping == NULL while being dirty with clean buffers.
+		 */
+		if (PageDirty(page) && PagePrivate(page)) {
+			if (try_to_free_buffers(page)) {
+				ClearPageDirty(page);
+				printk("%s: orphaned page\n", __FUNCTION__);
+				return PAGE_CLEAN;
+			}
+		}
+		return PAGE_KEEP;
+	}
+	/* The mapping has no ->writepage(): ask the caller to activate the page */
+	if (mapping->a_ops->writepage == NULL)
+		return PAGE_ACTIVATE;
+	/* See may_write_to_queue() for when writeback may be started */
+	if (!may_write_to_queue(mapping->backing_dev_info))
+		return PAGE_KEEP;
+
+	/*
+	 * ->writepage() is only called when clear_page_dirty_for_io() returns
+	 * non-zero; otherwise the page is reported as clean.
+	 */
+	if (clear_page_dirty_for_io(page)) {
+		int res;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = SWAP_CLUSTER_MAX,
+			.nonblocking = 1,
+			.for_reclaim = 1,
+		};
+
+		SetPageReclaim(page);
+		res = mapping->a_ops->writepage(page, &wbc);
+		/* Record a write error in the mapping's flags */
+		if (res < 0)
+			handle_write_error(mapping, page, res);
+		if (res == WRITEPAGE_ACTIVATE) {
+			ClearPageReclaim(page);
+			return PAGE_ACTIVATE;
+		}
+		if (!PageWriteback(page)) {
+			/* synchronous write or broken a_ops? */
+			ClearPageReclaim(page);
+		}
+
+		/* Note: PAGE_SUCCESS is returned even when res was negative */
+		return PAGE_SUCCESS;
+	}
+
+	return PAGE_CLEAN;
+}
+
+/*
+ * shrink_list() tries to reclaim every page on @page_list; the pages are
+ * expected to be off the LRU already (see the BUG_ON(PageLRU()) below).
+ * Pages that could not be freed are left on @page_list on return, with
+ * PG_active set on those the caller should put on the active list.
+ *
+ * Returns the number of pages freed; the same number is added to
+ * sc->nr_reclaimed.  sc->nr_scanned is advanced for every page that could
+ * be locked (twice for mapped and swapcache pages).
+ */
+static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+{
+	LIST_HEAD(ret_pages);
+	struct pagevec freed_pvec;
+	int pgactivate = 0;
+	int reclaimed = 0;
+
+	cond_resched();
+
+	pagevec_init(&freed_pvec, 1);
+	while (!list_empty(page_list)) {
+		struct address_space *mapping;
+		struct page *page;
+		int may_enter_fs;
+		int referenced;
+
+		cond_resched();
+
+		page = lru_to_page(page_list);
+		list_del(&page->lru);
+
+		/* Somebody else has the page locked: don't wait, keep it */
+		if (TestSetPageLocked(page))
+			goto keep;
+
+		BUG_ON(PageActive(page));
+
+		sc->nr_scanned++;
+		/* Double the slab pressure for mapped and swapcache pages */
+		if (page_mapped(page) || PageSwapCache(page))
+			sc->nr_scanned++;
+
+		if (PageWriteback(page))
+			goto keep_locked;
+
+		referenced = page_referenced(page, 1, sc->priority <= 0);
+		/* In active use or really unfreeable?  Activate it. */
+		if (referenced && page_mapping_inuse(page))
+			goto activate_locked;
+
+#ifdef CONFIG_SWAP
+		/*
+		 * Anonymous process memory has backing store?
+		 * Try to allocate it some swap space here.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page))
+				goto activate_locked;
+		}
+#endif /* CONFIG_SWAP */
+
+		/* Looked up only now: add_to_swap() above may have changed it */
+		mapping = page_mapping(page);
+		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+		/*
+		 * The page is mapped into the page tables of one or more
+		 * processes. Try to unmap it here.
+		 */
+		if (page_mapped(page) && mapping) {
+			switch (try_to_unmap(page)) {
+			case SWAP_FAIL:
+				goto activate_locked;
+			case SWAP_AGAIN:
+				goto keep_locked;
+			case SWAP_SUCCESS:
+				; /* try to free the page below */
+			}
+		}
+
+		if (PageDirty(page)) {
+			if (referenced)
+				goto keep_locked;
+			if (!may_enter_fs)
+				goto keep_locked;
+			if (laptop_mode && !sc->may_writepage)
+				goto keep_locked;
+
+			/* Page is dirty, try to write it out here */
+			switch(pageout(page, mapping)) {
+			case PAGE_KEEP:
+				goto keep_locked;
+			case PAGE_ACTIVATE:
+				goto activate_locked;
+			case PAGE_SUCCESS:
+				/* pageout() returned with the page unlocked */
+				if (PageWriteback(page) || PageDirty(page))
+					goto keep;
+				/*
+				 * A synchronous write - probably a ramdisk.  Go
+				 * ahead and try to reclaim the page.
+				 */
+				if (TestSetPageLocked(page))
+					goto keep;
+				if (PageDirty(page) || PageWriteback(page))
+					goto keep_locked;
+				/* Re-read: the page was unlocked for a while */
+				mapping = page_mapping(page);
+				/* deliberate fall through to PAGE_CLEAN */
+			case PAGE_CLEAN:
+				; /* try to free the page below */
+			}
+		}
+
+		/*
+		 * If the page has buffers, try to free the buffer mappings
+		 * associated with this page. If we succeed we try to free
+		 * the page as well.
+		 *
+		 * We do this even if the page is PageDirty().
+		 * try_to_release_page() does not perform I/O, but it is
+		 * possible for a page to have PageDirty set, but it is actually
+		 * clean (all its buffers are clean).  This happens if the
+		 * buffers were written out directly, with submit_bh(). ext3
+		 * will do this, as well as the blockdev mapping.
+		 * try_to_release_page() will discover that cleanness and will
+		 * drop the buffers and mark the page clean - it can be freed.
+		 *
+		 * Rarely, pages can have buffers and no ->mapping.  These are
+		 * the pages which were not successfully invalidated in
+		 * truncate_complete_page().  We try to drop those buffers here
+		 * and if that worked, and the page is no longer mapped into
+		 * process address space (page_count == 1) it can be freed.
+		 * Otherwise, leave the page on the LRU so it is swappable.
+		 */
+		if (PagePrivate(page)) {
+			if (!try_to_release_page(page, sc->gfp_mask))
+				goto activate_locked;
+			if (!mapping && page_count(page) == 1)
+				goto free_it;
+		}
+
+		if (!mapping)
+			goto keep_locked;	/* truncate got there first */
+
+		write_lock_irq(&mapping->tree_lock);
+
+		/*
+		 * The non-racy check for busy page.  It is critical to check
+		 * PageDirty _after_ making sure that the page is freeable and
+		 * not in use by anybody. 	(pagecache + us == 2)
+		 */
+		if (page_count(page) != 2 || PageDirty(page)) {
+			write_unlock_irq(&mapping->tree_lock);
+			goto keep_locked;
+		}
+
+#ifdef CONFIG_SWAP
+		if (PageSwapCache(page)) {
+			/* Save the swap entry before the page leaves the swap cache */
+			swp_entry_t swap = { .val = page->private };
+			__delete_from_swap_cache(page);
+			write_unlock_irq(&mapping->tree_lock);
+			swap_free(swap);
+			__put_page(page);	/* The pagecache ref */
+			goto free_it;
+		}
+#endif /* CONFIG_SWAP */
+
+		__remove_from_page_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
+		__put_page(page);
+
+		/* Page is reclaimed: unlock it and batch it up for release */
+free_it:
+		unlock_page(page);
+		reclaimed++;
+		if (!pagevec_add(&freed_pvec, page))
+			__pagevec_release_nonlru(&freed_pvec);
+		continue;
+
+		/* Page is locked and should go to the active list */
+activate_locked:
+		SetPageActive(page);
+		pgactivate++;
+		/* Page is locked and stays on the list it came from */
+keep_locked:
+		unlock_page(page);
+		/* Page is not locked by us; hand it back to the caller */
+keep:
+		list_add(&page->lru, &ret_pages);
+		BUG_ON(PageLRU(page));
+	}
+	/* Give the unfreed pages back through @page_list */
+	list_splice(&ret_pages, page_list);
+	if (pagevec_count(&freed_pvec))
+		__pagevec_release_nonlru(&freed_pvec);
+	mod_page_state(pgactivate, pgactivate);
+	sc->nr_reclaimed += reclaimed;
+	return reclaimed;
+}
+
+/*
+ * zone->lru_lock is heavily contended.  Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan:	The number of pages to look through on the list.
+ * @src:	The LRU list to pull pages off.
+ * @dst:	The temp list to put pages on to.
+ * @scanned:	Set to the number of pages that were actually examined:
+ *		at most @nr_to_scan, fewer if @src ran empty first.
+ *
+ * Pages are taken from the tail of @src.  Each page has PG_lru cleared and
+ * gets an extra reference before it is put on @dst.  A page for which
+ * get_page_testone() reports that it is being freed elsewhere is put back
+ * at the head of @src with PG_lru set again; it counts as scanned but not
+ * as taken.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+			     struct list_head *dst, int *scanned)
+{
+	int nr_taken = 0;
+	struct page *page;
+	int scan;
+
+	/*
+	 * A for loop (rather than `while (scan++ < nr_to_scan ...)') so that
+	 * the counter is only bumped for pages really looked at; the
+	 * post-increment in the loop test used to report one page too many.
+	 */
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		page = lru_to_page(src);
+		prefetchw_prev_lru_page(page, src, flags);
+
+		if (!TestClearPageLRU(page))
+			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		}
+
+		list_add(&page->lru, dst);
+		nr_taken++;
+	}
+
+	*scanned = scan;
+	return nr_taken;
+}
+
+/*
+ * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ */
+static void shrink_cache(struct zone *zone, struct scan_control *sc)
+{
+	LIST_HEAD(page_list);
+	struct pagevec pvec;
+	int max_scan = sc->nr_to_scan;
+
+	pagevec_init(&pvec, 1);
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+	/* zone->lru_lock is held at the top of every iteration */
+	while (max_scan > 0) {
+		struct page *page;
+		int nr_taken;
+		int nr_scan;
+		int nr_freed;
+
+		/* Pull a batch off the inactive list under the lock */
+		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+					     &zone->inactive_list,
+					     &page_list, &nr_scan);
+		zone->nr_inactive -= nr_taken;
+		zone->pages_scanned += nr_scan;
+		spin_unlock_irq(&zone->lru_lock);
+
+		/* Nothing isolated: leave with the lock already dropped */
+		if (nr_taken == 0)
+			goto done;
+
+		max_scan -= nr_scan;
+		if (current_is_kswapd())
+			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+		else
+			mod_page_state_zone(zone, pgscan_direct, nr_scan);
+		/* The reclaim work itself runs without zone->lru_lock */
+		nr_freed = shrink_list(&page_list, sc);
+		if (current_is_kswapd())
+			mod_page_state(kswapd_steal, nr_freed);
+		mod_page_state_zone(zone, pgsteal, nr_freed);
+		sc->nr_to_reclaim -= nr_freed;
+
+		spin_lock_irq(&zone->lru_lock);
+		/*
+		 * Put back any unfreeable pages.
+		 */
+		while (!list_empty(&page_list)) {
+			page = lru_to_page(&page_list);
+			if (TestSetPageLRU(page))
+				BUG();
+			list_del(&page->lru);
+			/* shrink_list() set PG_active on pages to be activated */
+			if (PageActive(page))
+				add_page_to_active_list(zone, page);
+			else
+				add_page_to_inactive_list(zone, page);
+			/* pagevec full: release the batch with the lock dropped */
+			if (!pagevec_add(&pvec, page)) {
+				spin_unlock_irq(&zone->lru_lock);
+				__pagevec_release(&pvec);
+				spin_lock_irq(&zone->lru_lock);
+			}
+		}
+  	}
+	spin_unlock_irq(&zone->lru_lock);
+done:
+	pagevec_release(&pvec);
+}
+
+/*
+ * This moves pages from the active list to the inactive list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone->lru_lock around each page.  It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_count against each page.
+ * But we had to alter page->flags anyway.
+ */
+static void
+refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+{
+	int pgmoved;
+	int pgdeactivate = 0;
+	int pgscanned;
+	int nr_pages = sc->nr_to_scan;
+	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
+	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
+	struct page *page;
+	struct pagevec pvec;
+	int reclaim_mapped = 0;
+	long mapped_ratio;
+	long distress;
+	long swap_tendency;
+
+	/* Step 1: take up to nr_pages pages off the active list */
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+				    &l_hold, &pgscanned);
+	zone->pages_scanned += pgscanned;
+	zone->nr_active -= pgmoved;
+	spin_unlock_irq(&zone->lru_lock);
+
+	/*
+	 * `distress' is a measure of how much trouble we're having reclaiming
+	 * pages.  0 -> no problems.  100 -> great trouble.
+	 */
+	distress = 100 >> zone->prev_priority;
+
+	/*
+	 * The point of this algorithm is to decide when to start reclaiming
+	 * mapped memory instead of just pagecache.  Work out how much memory
+	 * is mapped.
+	 */
+	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+	/*
+	 * Now decide how much we really want to unmap some pages.  The mapped
+	 * ratio is downgraded - just because there's a lot of mapped memory
+	 * doesn't necessarily mean that page reclaim isn't succeeding.
+	 *
+	 * The distress ratio is important - we don't want to start going oom.
+	 *
+	 * A 100% value of vm_swappiness overrides this algorithm altogether.
+	 */
+	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+	/*
+	 * Now use this metric to decide whether to start moving mapped memory
+	 * onto the inactive list.
+	 */
+	if (swap_tendency >= 100)
+		reclaim_mapped = 1;
+
+	/*
+	 * Step 2: sort the held pages, without the lru_lock.  A mapped page
+	 * stays active if mapped memory is not being reclaimed, if it is
+	 * anonymous and there is no swap at all, or if it was referenced.
+	 * Everything else is deactivated.
+	 */
+	while (!list_empty(&l_hold)) {
+		cond_resched();
+		page = lru_to_page(&l_hold);
+		list_del(&page->lru);
+		if (page_mapped(page)) {
+			if (!reclaim_mapped ||
+			    (total_swap_pages == 0 && PageAnon(page)) ||
+			    page_referenced(page, 0, sc->priority <= 0)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
+		list_add(&page->lru, &l_inactive);
+	}
+
+	/* Step 3: put the deactivated pages onto the inactive list */
+	pagevec_init(&pvec, 1);
+	pgmoved = 0;
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(&l_inactive)) {
+		page = lru_to_page(&l_inactive);
+		prefetchw_prev_lru_page(page, &l_inactive, flags);
+		if (TestSetPageLRU(page))
+			BUG();
+		if (!TestClearPageActive(page))
+			BUG();
+		list_move(&page->lru, &zone->inactive_list);
+		pgmoved++;
+		/* pagevec full: update the counters, then release unlocked */
+		if (!pagevec_add(&pvec, page)) {
+			zone->nr_inactive += pgmoved;
+			spin_unlock_irq(&zone->lru_lock);
+			pgdeactivate += pgmoved;
+			pgmoved = 0;
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	zone->nr_inactive += pgmoved;
+	pgdeactivate += pgmoved;
+	if (buffer_heads_over_limit) {
+		spin_unlock_irq(&zone->lru_lock);
+		pagevec_strip(&pvec);
+		spin_lock_irq(&zone->lru_lock);
+	}
+
+	/* Step 4: put the still-active pages back onto the active list */
+	pgmoved = 0;
+	while (!list_empty(&l_active)) {
+		page = lru_to_page(&l_active);
+		prefetchw_prev_lru_page(page, &l_active, flags);
+		if (TestSetPageLRU(page))
+			BUG();
+		BUG_ON(!PageActive(page));
+		list_move(&page->lru, &zone->active_list);
+		pgmoved++;
+		if (!pagevec_add(&pvec, page)) {
+			zone->nr_active += pgmoved;
+			pgmoved = 0;
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	zone->nr_active += pgmoved;
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+
+	mod_page_state_zone(zone, pgrefill, pgscanned);
+	mod_page_state(pgdeactivate, pgdeactivate);
+}
+
+/*
+ * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ */
+static void
+shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+
+	/*
+	 * Add one to `nr_to_scan' just to make sure that the kernel will
+	 * slowly sift through the active list.
+	 */
+	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	nr_active = zone->nr_scan_active;
+	/*
+	 * Scan work is accumulated in the zone across calls and only acted
+	 * upon once at least swap_cluster_max pages are due.
+	 */
+	if (nr_active >= sc->swap_cluster_max)
+		zone->nr_scan_active = 0;
+	else
+		nr_active = 0;
+
+	/* Same batching for the inactive list */
+	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+	nr_inactive = zone->nr_scan_inactive;
+	if (nr_inactive >= sc->swap_cluster_max)
+		zone->nr_scan_inactive = 0;
+	else
+		nr_inactive = 0;
+
+	sc->nr_to_reclaim = sc->swap_cluster_max;
+
+	/* Alternate between the two lists, swap_cluster_max pages at a time */
+	while (nr_active || nr_inactive) {
+		if (nr_active) {
+			sc->nr_to_scan = min(nr_active,
+					(unsigned long)sc->swap_cluster_max);
+			nr_active -= sc->nr_to_scan;
+			refill_inactive_zone(zone, sc);
+		}
+
+		if (nr_inactive) {
+			sc->nr_to_scan = min(nr_inactive,
+					(unsigned long)sc->swap_cluster_max);
+			nr_inactive -= sc->nr_to_scan;
+			shrink_cache(zone, sc);
+			/* Stop once shrink_cache() has met the reclaim target */
+			if (sc->nr_to_reclaim <= 0)
+				break;
+		}
+	}
+
+	throttle_vm_writeout();
+}
+
+/*
+ * This is the direct reclaim path, for page-allocating processes.  We only
+ * try to reclaim pages from zones which will satisfy the caller's allocation
+ * request.
+ *
+ * We reclaim from a zone even if that zone is over pages_high.  Because:
+ * a) The caller may be trying to free *extra* pages to satisfy a higher-order
+ *    allocation or
+ * b) The zones may be over pages_high but they must go *over* pages_high to
+ *    satisfy the `incremental min' zone defense algorithm.
+ *
+ * Returns nothing; reclaimed pages are accounted in sc->nr_reclaimed.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
+ */
+static void
+shrink_caches(struct zone **zones, struct scan_control *sc)
+{
+	struct zone **zp;
+
+	/* @zones is a NULL-terminated array */
+	for (zp = zones; *zp != NULL; zp++) {
+		struct zone *zone = *zp;
+
+		/* Skip empty zones and zones rejected by cpuset_zone_allowed() */
+		if (zone->present_pages == 0 || !cpuset_zone_allowed(zone))
+			continue;
+
+		/* Record the scan priority; prev_priority is only ever lowered here */
+		zone->temp_priority = sc->priority;
+		if (zone->prev_priority > sc->priority)
+			zone->prev_priority = sc->priority;
+
+		/*
+		 * An all_unreclaimable zone is only scanned at DEF_PRIORITY;
+		 * otherwise let kswapd poll it.
+		 */
+		if (!zone->all_unreclaimable || sc->priority == DEF_PRIORITY)
+			shrink_zone(zone, sc);
+	}
+}
+ 
+/*
+ * This is the main entry point to direct page reclaim.
+ *
+ * If a full scan of the inactive list fails to free enough memory then we
+ * are "out of memory" and something needs to be killed.
+ *
+ * If the caller is !__GFP_FS then the probability of a failure is reasonably
+ * high - the zone may be full of dirty or under-writeback pages, which this
+ * caller can't do much about.  We kick pdflush and take explicit naps in the
+ * hope that some of these pages can be written.  But if the allocating task
+ * holds filesystem locks which prevent writeout this might not work, and the
+ * allocation attempt will fail.
+ */
+/*
+ * @zones:	NULL-terminated array of zones to reclaim from
+ * @gfp_mask:	allocation context of the caller, passed on via scan_control
+ * @order:	not used in this function body
+ *
+ * Returns 1 as soon as at least SWAP_CLUSTER_MAX pages have been reclaimed,
+ * 0 if every priority from DEF_PRIORITY down to 0 was tried without getting
+ * there.
+ */
+int try_to_free_pages(struct zone **zones,
+		unsigned int gfp_mask, unsigned int order)
+{
+	int priority;
+	int ret = 0;
+	int total_scanned = 0, total_reclaimed = 0;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct scan_control sc;
+	unsigned long lru_pages = 0;
+	int i;
+
+	sc.gfp_mask = gfp_mask;
+	sc.may_writepage = 0;
+
+	inc_page_state(allocstall);
+
+	/* Reset temp_priority and count the LRU pages of the allowed zones */
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *zone = zones[i];
+
+		if (!cpuset_zone_allowed(zone))
+			continue;
+
+		zone->temp_priority = DEF_PRIORITY;
+		lru_pages += zone->nr_active + zone->nr_inactive;
+	}
+
+	/* A lower priority value means more pages are scanned per zone */
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		sc.nr_mapped = read_page_state(nr_mapped);
+		sc.nr_scanned = 0;
+		sc.nr_reclaimed = 0;
+		sc.priority = priority;
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+		shrink_caches(zones, &sc);
+		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		/* Pages freed by slab are credited through reclaim_state */
+		if (reclaim_state) {
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
+		total_scanned += sc.nr_scanned;
+		total_reclaimed += sc.nr_reclaimed;
+		if (total_reclaimed >= sc.swap_cluster_max) {
+			ret = 1;
+			goto out;
+		}
+
+		/*
+		 * Try to write back as many pages as we just scanned.  This
+		 * tends to cause slow streaming writers to write data to the
+		 * disk smoothly, at the dirtying rate, which is nice.   But
+		 * that's undesirable in laptop mode, where we *want* lumpy
+		 * writeout.  So in laptop mode, write out the whole world.
+		 */
+		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+			wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+			sc.may_writepage = 1;
+		}
+
+		/* Take a nap, wait for some writeback to complete */
+		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+			blk_congestion_wait(WRITE, HZ/10);
+	}
+out:
+	/* Publish the priority reached in this run as each zone's prev_priority */
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *zone = zones[i];
+
+		if (!cpuset_zone_allowed(zone))
+			continue;
+
+		zone->prev_priority = zone->temp_priority;
+	}
+	return ret;
+}
+
+/*
+ * For kswapd, balance_pgdat() will work across all this node's zones until
+ * they are all at pages_high.
+ *
+ * If `nr_pages' is non-zero then it is the number of pages which are to be
+ * reclaimed, regardless of the zone occupancies.  This is a software suspend
+ * special.
+ *
+ * Returns the number of pages which were actually freed.
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim.  Mark the zone as
+ * dead and from now on, only perform a short scan.  Basically we're polling
+ * the zone for when the problem goes away.
+ *
+ * kswapd scans the zones in the highmem->normal->dma direction.  It skips
+ * zones which have free_pages > pages_high, but once a zone is found to have
+ * free_pages <= pages_high, we scan that zone and the lower zones regardless
+ * of the number of free pages in the lower zones.  This interoperates with
+ * the page allocator fallback scheme to ensure that aging of pages is balanced
+ * across the zones.
+ */
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+{
+	int to_free = nr_pages;
+	int all_zones_ok;
+	int priority;
+	int i;
+	int total_scanned, total_reclaimed;
+	/* Dereferenced without a NULL check below: the caller must have set it */
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct scan_control sc;
+
+loop_again:
+	total_scanned = 0;
+	total_reclaimed = 0;
+	sc.gfp_mask = GFP_KERNEL;
+	sc.may_writepage = 0;
+	sc.nr_mapped = read_page_state(nr_mapped);
+
+	inc_page_state(pageoutrun);
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->temp_priority = DEF_PRIORITY;
+	}
+
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+		unsigned long lru_pages = 0;
+
+		all_zones_ok = 1;
+
+		if (nr_pages == 0) {
+			/*
+			 * Scan in the highmem->dma direction for the highest
+			 * zone which needs scanning
+			 */
+			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+				struct zone *zone = pgdat->node_zones + i;
+
+				if (zone->present_pages == 0)
+					continue;
+
+				if (zone->all_unreclaimable &&
+						priority != DEF_PRIORITY)
+					continue;
+
+				if (!zone_watermark_ok(zone, order,
+						zone->pages_high, 0, 0, 0)) {
+					end_zone = i;
+					goto scan;
+				}
+			}
+			/* Every zone passed its pages_high check: nothing to do */
+			goto out;
+		} else {
+			/* Software suspend: consider every zone */
+			end_zone = pgdat->nr_zones - 1;
+		}
+scan:
+		/* Total LRU size of the zones to be scanned, for shrink_slab() */
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			lru_pages += zone->nr_active + zone->nr_inactive;
+		}
+
+		/*
+		 * Now scan the zone in the dma->highmem direction, stopping
+		 * at the last zone which needs scanning.
+		 *
+		 * We do this because the page allocator works in the opposite
+		 * direction.  This prevents the page allocator from allocating
+		 * pages behind kswapd's direction of progress, which would
+		 * cause too much scanning of the lower zones.
+		 */
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			if (zone->present_pages == 0)
+				continue;
+
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
+
+			if (nr_pages == 0) {	/* Not software suspend */
+				if (!zone_watermark_ok(zone, order,
+						zone->pages_high, end_zone, 0, 0))
+					all_zones_ok = 0;
+			}
+			zone->temp_priority = priority;
+			if (zone->prev_priority > priority)
+				zone->prev_priority = priority;
+			sc.nr_scanned = 0;
+			sc.nr_reclaimed = 0;
+			sc.priority = priority;
+			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+			shrink_zone(zone, &sc);
+			/* Credit only the slab pages freed by this shrink_slab() call */
+			reclaim_state->reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += sc.nr_reclaimed;
+			total_scanned += sc.nr_scanned;
+			if (zone->all_unreclaimable)
+				continue;
+			/* Scanned four times the zone's LRU size: mark the zone dead */
+			if (zone->pages_scanned >= (zone->nr_active +
+							zone->nr_inactive) * 4)
+				zone->all_unreclaimable = 1;
+			/*
+			 * If we've done a decent amount of scanning and
+			 * the reclaim ratio is low, start doing writepage
+			 * even in laptop mode
+			 */
+			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			    total_scanned > total_reclaimed+total_reclaimed/2)
+				sc.may_writepage = 1;
+		}
+		if (nr_pages && to_free > total_reclaimed)
+			continue;	/* swsusp: need to do more work */
+		if (all_zones_ok)
+			break;		/* kswapd: all done */
+		/*
+		 * OK, kswapd is getting into trouble.  Take a nap, then take
+		 * another pass across the zones.
+		 */
+		if (total_scanned && priority < DEF_PRIORITY - 2)
+			blk_congestion_wait(WRITE, HZ/10);
+
+		/*
+		 * We do this so kswapd doesn't build up large priorities for
+		 * example when it is freeing in parallel with allocators. It
+		 * matches the direct reclaim path behaviour in terms of impact
+		 * on zone->*_priority.
+		 */
+		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+			break;
+	}
+out:
+	/* Publish the priority reached in this run as each zone's prev_priority */
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		zone->prev_priority = zone->temp_priority;
+	}
+	/* Some zone failed its pages_high check: start over at DEF_PRIORITY */
+	if (!all_zones_ok) {
+		cond_resched();
+		goto loop_again;
+	}
+
+	return total_reclaimed;
+}
+
+/*
+ * kswapd - per-node background page reclaim thread.
+ *
+ * @p: the pg_data_t of the node this instance services.
+ *
+ * Started as a kernel thread (see kswapd_init()).  Its job is to trickle
+ * pages out so that some free memory stays available even when nothing
+ * else is freeing any - e.g. for routing-type workloads, where all the
+ * activity may happen in asynchronous contexts that cannot page things
+ * out themselves.  With ordinary memory-allocating applications running,
+ * this basically shouldn't matter.
+ *
+ * The main loop never terminates; the trailing return is not reached.
+ */
+static int kswapd(void *p)
+{
+	pg_data_t *pgdat = p;
+	struct task_struct *tsk = current;
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
+	cpumask_t node_cpus;
+	unsigned long order = 0;
+	DEFINE_WAIT(wait);
+
+	daemonize("kswapd%d", pgdat->node_id);
+
+	/* Bind to the node's own CPUs, provided the node has any. */
+	node_cpus = node_to_cpumask(pgdat->node_id);
+	if (!cpus_empty(node_cpus))
+		set_cpus_allowed(tsk, node_cpus);
+
+	/* Slab pages freed on our behalf are accounted through this. */
+	tsk->reclaim_state = &reclaim_state;
+
+	/*
+	 * Mark ourselves as a "memory allocator" so that, should we need
+	 * memory, we are given it regardless (see "__alloc_pages()") and
+	 * never get caught in the normal page freeing logic.
+	 *
+	 * kswapd rarely needs memory, but paging something out can take a
+	 * small allocation; this flag keeps us from recursing into reclaim
+	 * while we are in the middle of freeing memory ourselves.
+	 */
+	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+
+	while (1) {
+		unsigned long requested;
+
+		if (tsk->flags & PF_FREEZE)
+			refrigerator(PF_FREEZE);
+
+		/*
+		 * Queue ourselves on the wait queue before sampling the
+		 * request, then consume (zero) the pending order.
+		 */
+		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+		requested = pgdat->kswapd_max_order;
+		pgdat->kswapd_max_order = 0;
+		if (order >= requested) {
+			/* Nothing bigger is pending: sleep until woken. */
+			schedule();
+			order = pgdat->kswapd_max_order;
+		} else {
+			/*
+			 * Somebody wants a larger 'order' allocation than
+			 * the one we just handled, so skip the sleep.
+			 */
+			order = requested;
+		}
+		finish_wait(&pgdat->kswapd_wait, &wait);
+
+		balance_pgdat(pgdat, 0, order);
+	}
+	return 0;
+}
+
+/*
+ * wakeup_kswapd - kick the kswapd thread of a zone's node.
+ *
+ * @zone:  the zone that is running low on free memory.
+ * @order: allocation order the caller is trying to satisfy.
+ *
+ * Does nothing for a zone with no present pages, or when the zone still
+ * passes zone_watermark_ok() against its pages_low mark.  Otherwise the
+ * node's kswapd_max_order is raised to @order if that is larger, and the
+ * waiter on the node's kswapd_wait queue is woken - but only when
+ * cpuset_zone_allowed() accepts the zone and somebody is actually waiting.
+ */
+void wakeup_kswapd(struct zone *zone, int order)
+{
+	pg_data_t *pgdat;
+
+	if (!zone->present_pages)
+		return;
+	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+		return;
+
+	/* Record the request even if no wakeup ends up being issued. */
+	pgdat = zone->zone_pgdat;
+	if (order > pgdat->kswapd_max_order)
+		pgdat->kswapd_max_order = order;
+
+	if (cpuset_zone_allowed(zone) && waitqueue_active(&pgdat->kswapd_wait))
+		wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+#ifdef CONFIG_PM
+/*
+ * shrink_all_memory - try to free `nr_pages' of memory, system-wide.
+ *
+ * @nr_pages: number of pages the caller would like freed.
+ *
+ * Walks the nodes with for_each_pgdat(), asking balance_pgdat() on each
+ * one for whatever is still outstanding, and stops as soon as the target
+ * has been met.  A local reclaim_state is installed on the current task
+ * for the duration of the walk and removed again before returning.
+ *
+ * Returns the number of pages freed, as reported by balance_pgdat().
+ */
+int shrink_all_memory(int nr_pages)
+{
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
+	pg_data_t *pgdat;
+	int remaining = nr_pages;
+	int total_freed = 0;
+
+	current->reclaim_state = &reclaim_state;
+	for_each_pgdat(pgdat) {
+		int freed = balance_pgdat(pgdat, remaining, 0);
+
+		total_freed += freed;
+		remaining -= freed;
+		if (remaining <= 0)
+			break;
+	}
+	current->reclaim_state = NULL;
+	return total_freed;
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_callback - CPU hotplug notifier for the kswapd threads.
+ *
+ * @nfb:    the notifier block (unused here).
+ * @action: hotplug event; only CPU_ONLINE is acted upon.
+ * @hcpu:   event payload (unused here).
+ *
+ * Keeping each kswapd on the same CPUs as its memory is optimal, but not
+ * required for correctness.  So when the last cpu of a node goes away,
+ * its kswapd gets changed to run anywhere; when the first one comes back
+ * we restore the cpu binding.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	pg_data_t *pgdat;
+
+	if (action != CPU_ONLINE)
+		return NOTIFY_OK;
+
+	for_each_pgdat(pgdat) {
+		cpumask_t node_cpus = node_to_cpumask(pgdat->node_id);
+
+		/* No CPU of this node is online: leave its kswapd alone. */
+		if (any_online_cpu(node_cpus) == NR_CPUS)
+			continue;
+
+		/* One of our CPUs online: restore mask */
+		set_cpus_allowed(pgdat->kswapd, node_cpus);
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * kswapd_init - start the per-node kswapd threads.
+ *
+ * Calls swap_setup(), spawns one kswapd kernel thread per node and stores
+ * its task in pgdat->kswapd, initialises total_memory from
+ * nr_free_pagecache_pages() and registers cpu_callback() as a hotplug
+ * notifier.  Always returns 0.
+ *
+ * NOTE(review): the value returned by kernel_thread() is handed straight
+ * to find_task_by_pid() without an error check - confirm what ends up in
+ * pgdat->kswapd if thread creation fails.
+ */
+static int __init kswapd_init(void)
+{
+	pg_data_t *pgdat;
+
+	swap_setup();
+	for_each_pgdat(pgdat) {
+		int pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+
+		pgdat->kswapd = find_task_by_pid(pid);
+	}
+	total_memory = nr_free_pagecache_pages();
+
+	/*
+	 * cpu_callback() is only defined under CONFIG_HOTPLUG_CPU;
+	 * presumably hotcpu_notifier() compiles away otherwise - verify.
+	 */
+	hotcpu_notifier(cpu_callback, 0);
+	return 0;
+}
+
+module_init(kswapd_init)
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /mm
download	linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2