Linux-2.6.12-rc2 (v2.6.12-rc2)

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/mm
download: linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2
9 files changed, 2592 insertions, 0 deletions
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
new file mode 100644
index 000000000000..7078f67887ec
--- /dev/null
+++ b/arch/ia64/mm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the ia64-specific parts of the memory manager.
+#
+
+# Objects that are always built.
+obj-y := init.o fault.o tlb.o extable.o
+
+# Objects that depend on the kernel configuration.
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_NUMA)	   += numa.o
+obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+# contig.o is only built when CONFIG_DISCONTIGMEM is not set; it is the
+# alternative to discontig.o.
+ifndef CONFIG_DISCONTIGMEM
+obj-y += contig.o
+endif
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
new file mode 100644
index 000000000000..6daf15ac8940
--- /dev/null
+++ b/arch/ia64/mm/contig.c
@@ -0,0 +1,299 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
+ *
+ * Routines used by ia64 machines with contiguous (or virtually contiguous)
+ * memory.
+ */
+#include <linux/config.h>
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <asm/meminit.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mca.h>
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+/* number of pages below MAX_DMA_ADDRESS; filled in by paging_init() via count_dma_pages() */
+static unsigned long num_dma_physpages;
+#endif
+
+/**
+ * show_mem - print a summary of memory usage
+ *
+ * Prints the free-area and swap statistics, then scans every valid page
+ * frame below max_mapnr and reports how many pages are present, reserved,
+ * shared and in the swap cache, plus the size of the page table cache.
+ */
+void
+show_mem (void)
+{
+	int pfn;
+	int total = 0, reserved = 0, shared = 0, cached = 0;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+
+	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+	/* scan from the highest page frame down to frame 0 */
+	for (pfn = max_mapnr; pfn-- > 0; ) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = mem_map + pfn;
+		total++;
+		if (PageReserved(page)) {
+			reserved++;
+		} else if (PageSwapCache(page)) {
+			cached++;
+		} else if (page_count(page)) {
+			/* every reference beyond the first counts as a share */
+			shared += page_count(page) - 1;
+		}
+	}
+
+	printk("%d pages of RAM\n", total);
+	printk("%d reserved pages\n", reserved);
+	printk("%d pages shared\n", shared);
+	printk("%d pages swap cached\n", cached);
+	printk("%ld pages in page table cache\n", pgtable_cache_size);
+}
+
+/*
+ * physical address where the bootmem map is located; find_memory() presets
+ * it to ~0UL and find_bootmap_location() stores the chosen address
+ */
+unsigned long bootmap_start;
+
+/**
+ * find_max_pfn - callback tracking the highest page frame number seen
+ * @start: start of range (not used)
+ * @end: end of range
+ * @arg: pointer to the unsigned long holding the running maximum
+ *
+ * Passed as a callback function to efi_memmap_walk(): converts the end of
+ * the range to a page frame number and raises *@arg to it when it is larger.
+ * Always returns 0.
+ */
+int
+find_max_pfn (unsigned long start, unsigned long end, void *arg)
+{
+	unsigned long *highest = arg;
+	unsigned long last_pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT;
+
+	if (*highest < last_pfn)
+		*highest = last_pfn;
+
+	return 0;
+}
+
+/**
+ * find_bootmap_location - callback to find a memory area for the bootmap
+ * @start: start of region
+ * @end: end of region
+ * @arg: pointer to an unsigned long holding the number of bytes needed
+ *
+ * Find a place to put the bootmap and return its starting address in
+ * bootmap_start.  This address must be page-aligned.
+ *
+ * Returns -1 once a large enough gap between reserved regions has been
+ * found inside [@start, @end), and 0 otherwise.
+ */
+int
+find_bootmap_location (unsigned long start, unsigned long end, void *arg)
+{
+	unsigned long needed = *(unsigned long *)arg;
+	unsigned long range_start, range_end, free_start;
+	int i;
+
+#if IGNORE_PFN0
+	/* never hand out the very first page: skip it, and give up if nothing is left */
+	if (start == PAGE_OFFSET) {
+		start += PAGE_SIZE;
+		if (start >= end)
+			return 0;
+	}
+#endif
+
+	/* free space is tracked from PAGE_OFFSET and advanced past each reserved region */
+	free_start = PAGE_OFFSET;
+
+	for (i = 0; i < num_rsvd_regions; i++) {
+		/*
+		 * Candidate gap: the part of [start, end) that lies between the
+		 * end of the previous reserved region and the (page-aligned)
+		 * start of this one.
+		 */
+		range_start = max(start, free_start);
+		range_end   = min(end, rsvd_region[i].start & PAGE_MASK);
+
+		free_start = PAGE_ALIGN(rsvd_region[i].end);
+
+		if (range_end <= range_start)
+			continue; /* skip over empty range */
+
+		if (range_end - range_start >= needed) {
+			bootmap_start = __pa(range_start);
+			return -1;	/* done */
+		}
+
+		/* nothing more available in this segment */
+		if (range_end == end)
+			return 0;
+	}
+	return 0;
+}
+
+/**
+ * find_memory - setup memory map
+ *
+ * Walk the EFI memory map and find usable memory for the system, taking
+ * into account reserved areas.  Determines max_pfn, finds room for the
+ * bootmem map, initializes the bootmem allocator and finally looks for the
+ * initrd.  Panics if no region can hold the bootmem map.
+ */
+void
+find_memory (void)
+{
+	unsigned long bootmap_size;
+
+	reserve_memory();
+
+	/* first find highest page frame number */
+	max_pfn = 0;
+	efi_memmap_walk(find_max_pfn, &max_pfn);
+
+	/* how many bytes to cover all the pages */
+	bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT;
+
+	/* look for a location to hold the bootmap */
+	bootmap_start = ~0UL;	/* sentinel: still ~0UL if the walk finds nothing */
+	efi_memmap_walk(find_bootmap_location, &bootmap_size);
+	if (bootmap_start == ~0UL)
+		panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
+
+	/* from here on bootmap_size holds the value returned by init_bootmem() */
+	bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
+
+	/* Free all available memory, then mark bootmem-map as being in use. */
+	efi_memmap_walk(filter_rsvd_memory, free_bootmem);
+	reserve_bootmem(bootmap_start, bootmap_size);
+
+	find_initrd();
+}
+
+#ifdef CONFIG_SMP
+/**
+ * per_cpu_init - set up the per-cpu data areas
+ *
+ * On CPU 0, allocates NR_CPUS areas of PERCPU_PAGE_SIZE bytes from bootmem,
+ * seeds each one with the initial per-cpu data and records its offset.
+ * Every caller gets back the address of its own per-cpu area.
+ */
+void *
+per_cpu_init (void)
+{
+	int cpu;
+
+	/*
+	 * get_free_pages() cannot be used before cpu_init() is done, so the
+	 * BSP allocates "NR_CPUS" pages for all CPUs up front; that way an AP
+	 * never has to call get_zeroed_page().
+	 */
+	if (smp_processor_id() == 0) {
+		char *area = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+					     PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++, area += PERCPU_PAGE_SIZE) {
+			memcpy(area, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = area - __per_cpu_start;
+			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
+		}
+	}
+	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * efi_memmap_walk() callback: add the number of pages in [start, end) to the
+ * unsigned long counter that @arg points to.  Always returns 0.
+ */
+static int
+count_pages (u64 start, u64 end, void *arg)
+{
+	unsigned long *total = arg;
+	u64 bytes = end - start;
+
+	*total += bytes >> PAGE_SHIFT;
+	return 0;
+}
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+/*
+ * efi_memmap_walk() callback: add to the unsigned long counter behind @arg
+ * the number of pages of [start, end) that lie below MAX_DMA_ADDRESS.
+ * Always returns 0.
+ */
+static int
+count_dma_pages (u64 start, u64 end, void *arg)
+{
+	unsigned long *dma_pages = arg;
+
+	/* ranges starting at or above MAX_DMA_ADDRESS contribute nothing */
+	if (start >= MAX_DMA_ADDRESS)
+		return 0;
+
+	*dma_pages += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
+	return 0;
+}
+#endif
+
+/*
+ * Set up the page tables.
+ *
+ * Counts the physical pages, splits the page frames below max_low_pfn into
+ * ZONE_DMA (below MAX_DMA_ADDRESS) and ZONE_NORMAL, and hands the zone sizes
+ * to the core allocator.  With CONFIG_VIRTUAL_MEM_MAP the holes in each zone
+ * are reported as well and, if the largest hole is at least LARGE_GAP, the
+ * mem_map is placed in a virtually mapped area carved from the top of the
+ * vmalloc range.
+ */
+
+void
+paging_init (void)
+{
+	unsigned long max_dma;
+	unsigned long zones_size[MAX_NR_ZONES];
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+	unsigned long zholes_size[MAX_NR_ZONES];
+	unsigned long max_gap;
+#endif
+
+	/* initialize mem_map[] */
+
+	memset(zones_size, 0, sizeof(zones_size));
+
+	num_physpages = 0;
+	efi_memmap_walk(count_pages, &num_physpages);
+
+	/* page frame number of the DMA limit */
+	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+	memset(zholes_size, 0, sizeof(zholes_size));
+
+	num_dma_physpages = 0;
+	efi_memmap_walk(count_dma_pages, &num_dma_physpages);
+
+	/* hole size of a zone = frames spanned by the zone - pages actually counted in it */
+	if (max_low_pfn < max_dma) {
+		zones_size[ZONE_DMA] = max_low_pfn;
+		zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
+	} else {
+		zones_size[ZONE_DMA] = max_dma;
+		zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
+		if (num_physpages > num_dma_physpages) {
+			zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+			zholes_size[ZONE_NORMAL] =
+				((max_low_pfn - max_dma) -
+				 (num_physpages - num_dma_physpages));
+		}
+	}
+
+	max_gap = 0;
+	efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
+	if (max_gap < LARGE_GAP) {
+		/* holes are small: no virtual mem_map is set up */
+		vmem_map = (struct page *) 0;
+		free_area_init_node(0, &contig_page_data, zones_size, 0,
+				    zholes_size);
+	} else {
+		unsigned long map_size;
+
+		/* allocate virtual_mem_map */
+
+		map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+		vmalloc_end -= map_size;
+		vmem_map = (struct page *) vmalloc_end;
+		efi_memmap_walk(create_mem_map_page_table, NULL);
+
+		NODE_DATA(0)->node_mem_map = vmem_map;
+		free_area_init_node(0, &contig_page_data, zones_size,
+				    0, zholes_size);
+
+		printk("Virtual mem_map starts at 0x%p\n", mem_map);
+	}
+#else /* !CONFIG_VIRTUAL_MEM_MAP */
+	if (max_low_pfn < max_dma)
+		zones_size[ZONE_DMA] = max_low_pfn;
+	else {
+		zones_size[ZONE_DMA] = max_dma;
+		zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+	}
+	free_area_init(zones_size);
+#endif /* !CONFIG_VIRTUAL_MEM_MAP */
+	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
new file mode 100644
index 000000000000..3456a9b6971e
--- /dev/null
+++ b/arch/ia64/mm/discontig.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ * Copyright (c) 2004 Silicon Graphics, Inc
+ *	Russ Anderson <rja@sgi.com>
+ *	Jesse Barnes <jbarnes@sgi.com>
+ *	Jack Steiner <steiner@sgi.com>
+ */
+
+/*
+ * Platform initialization for Discontig Memory
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <linux/nodemask.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/meminit.h>
+#include <asm/numa.h>
+#include <asm/sections.h>
+
+/*
+ * Track per-node information needed to setup the boot memory allocator, the
+ * per-node areas, and the real VM.
+ *
+ * node_data, pgdat, pernode_addr and pernode_size are filled in by
+ * find_pernode_space(); the span in bootmem_data by build_node_maps(); the
+ * page counts and min_pfn/max_pfn by count_node_pages().
+ */
+struct early_node_data {
+	struct ia64_node_data *node_data;	/* this node's ia64_node_data, inside its per-node area */
+	pg_data_t *pgdat;			/* this node's pg_data_t, inside its per-node area */
+	unsigned long pernode_addr;		/* start of the per-node area; 0 until it has been set up */
+	unsigned long pernode_size;		/* page-aligned size of the per-node area */
+	struct bootmem_data bootmem_data;	/* bootmem descriptor for this node */
+	unsigned long num_physpages;		/* pages counted on this node */
+	unsigned long num_dma_physpages;	/* of those, pages below MAX_DMA_ADDRESS */
+	unsigned long min_pfn;			/* lowest page frame number seen on this node */
+	unsigned long max_pfn;			/* highest (end) page frame number seen on this node */
+};
+
+/* one entry per possible node, indexed by node id */
+static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
+
+/**
+ * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
+ *
+ * This function will move nodes with only CPUs (no memory)
+ * to a node with memory which is at the minimum numa_slit distance.
+ * Any reassignments will result in the compression of the nodes
+ * and renumbering the nid values where appropriate.
+ * The static declarations below are to avoid large stack size which
+ * makes the code not re-entrant.
+ */
+static void __init reassign_cpu_only_nodes(void)
+{
+	struct node_memblk_s *p;
+	int i, j, k, nnode, nid, cpu, cpunid, pxm;
+	u8 cslit, slit;
+	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
+	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
+	static int node_flip[MAX_NUMNODES] __initdata;
+	static int old_nid_map[NR_CPUS] __initdata;
+
+	/* mark every nid that owns a memory block; nnode counts the distinct ones */
+	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
+			set_bit(p->nid, (void *) nodes_with_mem);
+			nnode++;
+		}
+
+	/*
+	 * All nids with memory.
+	 */
+	if (nnode == num_online_nodes())
+		return;
+
+	/*
+	 * Change nids and attempt to migrate CPU-only nodes
+	 * to the best numa_slit (closest neighbor) possible.
+	 * For reassigned CPU nodes a nid can't be arrived at
+	 * until after this loop because the target nid's new
+	 * identity might not have been established yet. So
+	 * new nid values are fabricated above num_online_nodes() and
+	 * mapped back later to their true value.
+	 */
+	/* MCD - This code is a bit complicated, but may be unnecessary now.
+	 * We can now handle much more interesting node-numbering.
+	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
+	 * and that there be no holes in the numbering 0..numnodes
+	 * has become simply 0 <= nid <= MAX_NUMNODES.
+	 */
+	nid = 0;	/* next compacted nid to hand out to a node with memory */
+	for_each_online_node(i)  {
+		if (test_bit(i, (void *) nodes_with_mem)) {
+			/*
+			 * Save original nid value for numa_slit
+			 * fixup and node_cpuid reassignments.
+			 */
+			node_flip[nid] = i;
+
+			/* node already has its final number: nothing to renumber */
+			if (i == nid) {
+				nid++;
+				continue;
+			}
+
+			/* renumber this node's memory blocks to the compacted nid */
+			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+				if (p->nid == i)
+					p->nid = nid;
+
+			cpunid = nid;
+			nid++;
+		} else
+			cpunid = MAX_NUMNODES;	/* marks node i as CPU-only */
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			if (node_cpuid[cpu].nid == i) {
+				/*
+				 * For nodes not being reassigned just
+				 * fix the cpu's nid and reverse pxm map
+				 */
+				if (cpunid < MAX_NUMNODES) {
+					pxm = nid_to_pxm_map[i];
+					pxm_to_nid_map[pxm] =
+					          node_cpuid[cpu].nid = cpunid;
+					continue;
+				}
+
+				/*
+				 * For nodes being reassigned, find best node by
+				 * numa_slit information and then make a temporary
+				 * nid value based on current nid and num_online_nodes().
+				 */
+				slit = 0xff;
+				/* 2*num_online_nodes() stays in k if no node with memory beats slit */
+				k = 2*num_online_nodes();
+				for_each_online_node(j) {
+					if (i == j)
+						continue;
+					else if (test_bit(j, (void *) nodes_with_mem)) {
+						cslit = numa_slit[i * num_online_nodes() + j];
+						if (cslit < slit) {
+							k = num_online_nodes() + j;
+							slit = cslit;
+						}
+					}
+				}
+
+				/* save old nid map so we can update the pxm */
+				old_nid_map[cpu] = node_cpuid[cpu].nid;
+				node_cpuid[cpu].nid = k;
+			}
+	}
+
+	/*
+	 * Fixup temporary nid values for CPU-only nodes.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
+			/* no closest node was found above: fall back to the last node with memory */
+			pxm = nid_to_pxm_map[old_nid_map[cpu]];
+			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
+		} else {
+			/* translate the temporary value back to the target's compacted nid */
+			for (i = 0; i < nnode; i++) {
+				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
+					continue;
+
+				pxm = nid_to_pxm_map[old_nid_map[cpu]];
+				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
+				break;
+			}
+		}
+
+	/*
+	 * Fix numa_slit by compressing from larger
+	 * nid array to reduced nid array.
+	 */
+	for (i = 0; i < nnode; i++)
+		for (j = 0; j < nnode; j++)
+			numa_slit_fix[i * nnode + j] =
+				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
+
+	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
+
+	/* only the compacted nids 0..nnode-1 remain online */
+	nodes_clear(node_online_map);
+	for (i = 0; i < nnode; i++)
+		node_set_online(i);
+
+	return;
+}
+
+/*
+ * To prevent cache aliasing effects, align per-node structures so that they
+ * start at addresses that are strided by node number: round @addr up to the
+ * next 1MB boundary, then add @node times PERCPU_PAGE_SIZE.
+ */
+#define NODEDATA_ALIGN(addr, node)						\
+	((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
+
+/**
+ * build_node_maps - callback to record the memory span of each node
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Each node is treated as one virtually contiguous block described by a
+ * struct bootmem_data.  The block must start on an %IA64_GRANULE_SIZE
+ * boundary, so the range is widened to granule boundaries before it is
+ * merged into the node's span; non-existent pages inside the span simply
+ * become part of the virtual memmap.  min_low_pfn and max_low_pfn are
+ * updated as ranges arrive.  Always returns 0.
+ */
+static int __init build_node_maps(unsigned long start, unsigned long len,
+				  int node)
+{
+	struct bootmem_data *bdata = &mem_data[node].bootmem_data;
+	unsigned long range_end = start + len;
+	unsigned long first = GRANULEROUNDDOWN(start);
+	unsigned long last_pfn = GRANULEROUNDUP(range_end) >> PAGE_SHIFT;
+
+	if (bdata->node_low_pfn) {
+		/* widen the span already recorded for this node */
+		bdata->node_boot_start = min(first, bdata->node_boot_start);
+		bdata->node_low_pfn = max(last_pfn, bdata->node_low_pfn);
+	} else {
+		/* first range seen for this node */
+		bdata->node_boot_start = first;
+		bdata->node_low_pfn = last_pfn;
+	}
+
+	min_low_pfn = min(min_low_pfn, bdata->node_boot_start>>PAGE_SHIFT);
+	max_low_pfn = max(max_low_pfn, bdata->node_low_pfn);
+
+	return 0;
+}
+
+/**
+ * early_nr_phys_cpus_node - count the physical cpus on a node
+ * @node: node to check
+ *
+ * Returns how many cpus that actually exist are assigned to @node: cpu 0,
+ * plus every cpu with a non-zero phys_id.  nr_cpus_node() cannot be used
+ * yet because acpi_boot_init() (which builds the node_to_cpu_mask array)
+ * hasn't been called yet.
+ */
+static int early_nr_phys_cpus_node(int node)
+{
+	int cpu, count = 0;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node_cpuid[cpu].nid != node)
+			continue;
+		if (cpu == 0 || node_cpuid[cpu].phys_id)
+			count++;
+	}
+
+	return count;
+}
+
+
+/**
+ * early_nr_cpus_node - count the cpus assigned to a node
+ * @node: node to check
+ *
+ * Returns the number of node_cpuid[] entries whose nid is @node.
+ * nr_cpus_node() cannot be used yet because acpi_boot_init() (which builds
+ * the node_to_cpu_mask array) hasn't been called yet.  Note that node 0
+ * will also count all non-existent cpus.
+ */
+static int early_nr_cpus_node(int node)
+{
+	int cpu;
+	int count = 0;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node_cpuid[cpu].nid == node)
+			count++;
+	}
+
+	return count;
+}
+
+/**
+ * find_pernode_space - allocate memory for memory map and per-node structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * This routine reserves space for the per-cpu data struct, the list of
+ * pg_data_ts and the per-node data struct.  Each node will have something like
+ * the following in the first chunk of addr. space large enough to hold it.
+ *
+ *    ________________________
+ *   |                        |
+ *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
+ *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
+ *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
+ *   |------------------------|
+ *   |   local pg_data_t *    |
+ *   |------------------------|
+ *   |  local ia64_node_data  |
+ *   |------------------------|
+ *   |          ???           |
+ *   |________________________|
+ *
+ * Once this space has been set aside, the bootmem maps are initialized.  We
+ * could probably move the allocation of the per-cpu and ia64_node_data space
+ * outside of this function and use alloc_bootmem_node(), but doing it here
+ * is straightforward and we get the alignments we want so...
+ *
+ * Always returns 0.  A range that is outside the node's recorded span, or
+ * too small, is silently skipped, as is any range for a node whose space
+ * has already been set up.
+ */
+static int __init find_pernode_space(unsigned long start, unsigned long len,
+				     int node)
+{
+	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long pernodesize = 0, pernode, pages, mapsize;
+	void *cpu_data;
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+	epfn = (start + len) >> PAGE_SHIFT;
+
+	/* size of the bootmem map that must fit behind the per-node area */
+	pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
+	mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+
+	/*
+	 * Make sure this memory falls within this node's usable memory
+	 * since we may have thrown some away in build_node_maps().
+	 */
+	if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
+		return 0;
+
+	/* Don't setup this node's local space twice... */
+	if (mem_data[node].pernode_addr)
+		return 0;
+
+	/*
+	 * Calculate total size needed, incl. what's necessary
+	 * for good alignment and alias prevention.
+	 */
+	cpus = early_nr_cpus_node(node);
+	/* NOTE(review): phys_cpus is computed but not used in this function */
+	phys_cpus = early_nr_phys_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	pernode = NODEDATA_ALIGN(start, node);
+
+	/* Is this range big enough for what we want to store here? */
+	if (start + len > (pernode + pernodesize + mapsize)) {
+		mem_data[node].pernode_addr = pernode;
+		mem_data[node].pernode_size = pernodesize;
+		memset(__va(pernode), 0, pernodesize);
+
+		/* carve the area up in the same order the size was computed */
+		cpu_data = (void *)pernode;
+		pernode += PERCPU_PAGE_SIZE * cpus;
+		pernode += node * L1_CACHE_BYTES;
+
+		mem_data[node].pgdat = __va(pernode);
+		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+		/* last piece carved out; pernode is not advanced any further */
+		mem_data[node].node_data = __va(pernode);
+
+		mem_data[node].pgdat->bdata = bdp;
+
+		/*
+		 * Copy the static per-cpu data into the region we
+		 * just set aside and then setup __per_cpu_offset
+		 * for each CPU on this node.
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (node == node_cpuid[cpu].nid) {
+				memcpy(__va(cpu_data), __phys_per_cpu_start,
+				       __per_cpu_end - __per_cpu_start);
+				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+					__per_cpu_start;
+				cpu_data += PERCPU_PAGE_SIZE;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * free_node_bootmem - hand a memory range to a node's bootmem allocator
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Frees the given range in the bootmem allocator of @node's pg_data_t.
+ * After this function has been called for all the entries in the EFI memory
+ * map, the bootmem allocator will be ready to service allocation requests.
+ * Always returns 0.
+ */
+static int __init free_node_bootmem(unsigned long start, unsigned long len,
+				    int node)
+{
+	pg_data_t *pgdat = mem_data[node].pgdat;
+
+	free_bootmem_node(pgdat, start, len);
+	return 0;
+}
+
+/**
+ * reserve_pernode_space - reserve memory for per-node space
+ *
+ * For every online node, marks the node's bootmem map and its per-node area
+ * as reserved in the boot allocator, so that the real mem maps created
+ * later do not use that memory.
+ */
+static void __init reserve_pernode_space(void)
+{
+	int node;
+
+	for_each_online_node(node) {
+		pg_data_t *pgdat = mem_data[node].pgdat;
+		struct bootmem_data *bdata = pgdat->bdata;
+		unsigned long npages, map_base, map_bytes;
+		unsigned long area_base, area_bytes;
+
+		/* First the bootmem_map itself */
+		npages = bdata->node_low_pfn - (bdata->node_boot_start>>PAGE_SHIFT);
+		map_bytes = bootmem_bootmap_pages(npages) << PAGE_SHIFT;
+		map_base = __pa(bdata->node_bootmem_map);
+		reserve_bootmem_node(pgdat, map_base, map_bytes);
+
+		/* Now the per-node space */
+		area_bytes = mem_data[node].pernode_size;
+		area_base = __pa(mem_data[node].pernode_addr);
+		reserve_bootmem_node(pgdat, area_base, area_bytes);
+	}
+}
+
+/**
+ * initialize_pernode_data - fixup per-cpu & per-node pointers
+ *
+ * Each node's per-node area has a copy of the global pg_data_t list, so
+ * we copy that to each node here, as well as setting the per-cpu pointer
+ * to the local node data structure.  The active_cpus field of the per-node
+ * structure gets setup by the platform_cpu_init() function later.
+ */
+static void __init initialize_pernode_data(void)
+{
+	int cpu, node;
+	pg_data_t *pgdat_list[MAX_NUMNODES];
+
+	/*
+	 * Start from an all-NULL list: only the slots of online nodes are
+	 * filled in below, but the whole array is copied to every node.
+	 */
+	memset(pgdat_list, 0, sizeof(pgdat_list));
+
+	for_each_online_node(node)
+		pgdat_list[node] = mem_data[node].pgdat;
+
+	/* Copy the pg_data_t list to each node and init the node field */
+	for_each_online_node(node) {
+		memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
+		       sizeof(pgdat_list));
+	}
+
+	/* Set the node_data pointer for each per-cpu struct */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		node = node_cpuid[cpu].nid;
+		per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
+	}
+}
+
+/**
+ * find_memory - walk the EFI memory map and setup the bootmem allocator
+ *
+ * Called early in boot to setup the bootmem allocator, and to
+ * allocate the per-cpu and per-node structures.  Panics if the per-node
+ * space of an online node could not be set up.
+ */
+void __init find_memory(void)
+{
+	int node;
+
+	reserve_memory();
+
+	/* without any node information, fall back to a single node 0 */
+	if (num_online_nodes() == 0) {
+		printk(KERN_ERR "node info missing!\n");
+		node_set_online(0);
+	}
+
+	/* start values so the min()/max() updates in build_node_maps() work */
+	min_low_pfn = -1;
+	max_low_pfn = 0;
+
+	if (num_online_nodes() > 1)
+		reassign_cpu_only_nodes();
+
+	/* These actually end up getting called by call_pernode_memory() */
+	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
+	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
+
+	/*
+	 * Initialize the boot memory maps in reverse order since that's
+	 * what the bootmem allocator expects
+	 */
+	for (node = MAX_NUMNODES - 1; node >= 0; node--) {
+		unsigned long pernode, pernodesize, map;
+		struct bootmem_data *bdp;
+
+		if (!node_online(node))
+			continue;
+
+		bdp = &mem_data[node].bootmem_data;
+		pernode = mem_data[node].pernode_addr;
+		pernodesize = mem_data[node].pernode_size;
+		/* the bootmem map is placed directly behind the per-node area */
+		map = pernode + pernodesize;
+
+		/* Sanity check... */
+		if (!pernode)
+			panic("pernode space for node %d "
+			      "could not be allocated!", node);
+
+		init_bootmem_node(mem_data[node].pgdat,
+				  map>>PAGE_SHIFT,
+				  bdp->node_boot_start>>PAGE_SHIFT,
+				  bdp->node_low_pfn);
+	}
+
+	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
+
+	reserve_pernode_space();
+	initialize_pernode_data();
+
+	max_pfn = max_low_pfn;
+
+	find_initrd();
+}
+
+/**
+ * per_cpu_init - setup per-cpu variables
+ *
+ * find_pernode_space() has already placed the per-cpu areas and filled in
+ * __per_cpu_offset[]; CPU 0 only has to publish those offsets through
+ * local_per_cpu_offset.  Returns the address of the caller's per-cpu area.
+ */
+void *per_cpu_init(void)
+{
+	int n;
+
+	if (smp_processor_id() == 0) {
+		for (n = 0; n < NR_CPUS; n++)
+			per_cpu(local_per_cpu_offset, n) = __per_cpu_offset[n];
+	}
+
+	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+
+/**
+ * show_mem - give short summary of memory stats
+ *
+ * Prints the free-area and swap statistics, then for each pgdat the number
+ * of present, reserved, shared and swap-cached pages, followed by the
+ * system-wide totals, the page table cache size and the number of free
+ * buffer pages.
+ */
+void show_mem(void)
+{
+	pg_data_t *pgdat;
+	unsigned long total_present = 0;
+	int total_reserved = 0, total_shared = 0, total_cached = 0;
+	int i;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+	for_each_pgdat(pgdat) {
+		int node_reserved = 0, node_shared = 0, node_cached = 0;
+		unsigned long node_present = pgdat->node_present_pages;
+
+		printk("Node ID: %d\n", pgdat->node_id);
+
+		for (i = 0; i < pgdat->node_spanned_pages; i++) {
+			struct page *page;
+
+			/* skip holes in the node's span */
+			if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
+				continue;
+
+			page = pgdat->node_mem_map + i;
+			if (PageReserved(page)) {
+				node_reserved++;
+			} else if (PageSwapCache(page)) {
+				node_cached++;
+			} else if (page_count(page)) {
+				/* every reference beyond the first counts as a share */
+				node_shared += page_count(page) - 1;
+			}
+		}
+
+		printk("\t%ld pages of RAM\n", node_present);
+		printk("\t%d reserved pages\n", node_reserved);
+		printk("\t%d pages shared\n", node_shared);
+		printk("\t%d pages swap cached\n", node_cached);
+
+		total_present += node_present;
+		total_reserved += node_reserved;
+		total_cached += node_cached;
+		total_shared += node_shared;
+	}
+
+	printk("%ld pages of RAM\n", total_present);
+	printk("%d reserved pages\n", total_reserved);
+	printk("%d pages shared\n", total_shared);
+	printk("%d pages swap cached\n", total_cached);
+	printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
+	printk("%d free buffer pages\n", nr_free_buffer_pages());
+}
+
+/**
+ * call_pernode_memory - use SRAT to call callback functions with node info
+ * @start: physical start of range
+ * @len: length of range
+ * @arg: function to call for each range
+ *
+ * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
+ * out to which node a block of memory belongs.  Ignore memory that we cannot
+ * identify, and split blocks that run across multiple nodes.
+ *
+ * Take this opportunity to round the start address up and the end address
+ * down to page boundaries.
+ *
+ * @arg is called as func(start, len, node); the value it returns is ignored.
+ */
+void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
+{
+	unsigned long rs, re, end = start + len;
+	/* the per-node callbacks in this file all return int */
+	int (*func)(unsigned long, unsigned long, int) = arg;
+	int i;
+
+	start = PAGE_ALIGN(start);
+	end &= PAGE_MASK;
+	if (start >= end)
+		return;
+
+	if (!num_node_memblks) {
+		/* No SRAT table, so assume one node (node 0) */
+		(*func)(start, end - start, 0);
+		return;
+	}
+
+	for (i = 0; i < num_node_memblks; i++) {
+		/* intersection of [start, end) with memory block i */
+		rs = max(start, node_memblk[i].start_paddr);
+		re = min(end, node_memblk[i].start_paddr +
+			 node_memblk[i].size);
+
+		if (rs < re)
+			(*func)(rs, re - rs, node_memblk[i].nid);
+
+		/* this block reached the end of the range: nothing left to hand out */
+		if (re == end)
+			break;
+	}
+}
+
+/**
+ * count_node_pages - callback to build per-node memory info structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Each node has its own number of physical pages, DMAable pages, start, and
+ * end page frame number.  This routine will be called by call_pernode_memory()
+ * for each piece of usable memory and will setup these values for each node.
+ * Very similar to build_node_maps().  Always returns 0.
+ */
+static __init int count_node_pages(unsigned long start, unsigned long len, int node)
+{
+	struct early_node_data *nd = &mem_data[node];
+	unsigned long end = start + len;
+
+	nd->num_physpages += len >> PAGE_SHIFT;
+
+	/* only the part of the range below MAX_DMA_ADDRESS counts as DMAable */
+	if (start <= __pa(MAX_DMA_ADDRESS)) {
+		nd->num_dma_physpages +=
+			(min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT;
+	}
+
+	/* widen the range before folding it into the node's pfn span */
+	start = GRANULEROUNDDOWN(start);
+	start = ORDERROUNDDOWN(start);
+	end = GRANULEROUNDUP(end);
+
+	nd->max_pfn = max(nd->max_pfn, end >> PAGE_SHIFT);
+	nd->min_pfn = min(nd->min_pfn, start >> PAGE_SHIFT);
+
+	return 0;
+}
+
+/**
+ * paging_init - setup page tables
+ *
+ * paging_init() sets up the page tables for each node of the system and frees
+ * the bootmem allocator memory for general use.
+ *
+ * For every online node the zone sizes and hole sizes are derived from the
+ * values gathered by count_node_pages() and passed to free_area_init_node().
+ * The virtual mem_map is carved from the top of the vmalloc range while
+ * node 0 is being processed.
+ */
+void __init paging_init(void)
+{
+	unsigned long max_dma;
+	unsigned long zones_size[MAX_NR_ZONES];
+	unsigned long zholes_size[MAX_NR_ZONES];
+	unsigned long pfn_offset = 0;
+	int node;
+
+	/* page frame number of the DMA limit */
+	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+	/* so min() will work in count_node_pages */
+	for_each_online_node(node)
+		mem_data[node].min_pfn = ~0UL;
+
+	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
+
+	for_each_online_node(node) {
+		memset(zones_size, 0, sizeof(zones_size));
+		memset(zholes_size, 0, sizeof(zholes_size));
+
+		/* NOTE(review): accumulated without being reset here - confirm it starts at 0 */
+		num_physpages += mem_data[node].num_physpages;
+
+		/* hole size of a zone = frames spanned by the zone - pages counted in it */
+		if (mem_data[node].min_pfn >= max_dma) {
+			/* All of this node's memory is above ZONE_DMA */
+			zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+				mem_data[node].min_pfn;
+			zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+				mem_data[node].min_pfn -
+				mem_data[node].num_physpages;
+		} else if (mem_data[node].max_pfn < max_dma) {
+			/* All of this node's memory is in ZONE_DMA */
+			zones_size[ZONE_DMA] = mem_data[node].max_pfn -
+				mem_data[node].min_pfn;
+			zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
+				mem_data[node].min_pfn -
+				mem_data[node].num_dma_physpages;
+		} else {
+			/* This node has memory in both zones */
+			zones_size[ZONE_DMA] = max_dma -
+				mem_data[node].min_pfn;
+			zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+				mem_data[node].num_dma_physpages;
+			zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
+				max_dma;
+			zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
+				(mem_data[node].num_physpages -
+				 mem_data[node].num_dma_physpages);
+		}
+
+		/* the single virtual mem_map is set up once, while handling node 0 */
+		if (node == 0) {
+			vmalloc_end -=
+				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+			vmem_map = (struct page *) vmalloc_end;
+
+			efi_memmap_walk(create_mem_map_page_table, NULL);
+			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+		}
+
+		/* each node's mem_map is the slice of vmem_map starting at its first pfn */
+		pfn_offset = mem_data[node].min_pfn;
+
+		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
+		free_area_init_node(node, NODE_DATA(node), zones_size,
+				    pfn_offset, zholes_size);
+	}
+
+	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
new file mode 100644
index 000000000000..6d259e34f359
--- /dev/null
+++ b/arch/ia64/mm/extable.c
@@ -0,0 +1,90 @@
+/*
+ * Kernel exception handling table support.  Derived from arch/alpha/mm/extable.c.
+ *
+ * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <linux/config.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <asm/module.h>
+
+/*
+ * Comparison callback for sort(): order two exception-table entries by the
+ * absolute address that their location-relative "addr" member resolves to.
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+	const struct exception_table_entry *lhs = a;
+	const struct exception_table_entry *rhs = b;
+	u64 lhs_ip = (u64) &lhs->addr + lhs->addr;
+	u64 rhs_ip = (u64) &rhs->addr + rhs->addr;
+
+	/* compare instead of subtracting so the result cannot overflow */
+	if (lhs_ip != rhs_ip)
+		return (lhs_ip > rhs_ip) ? 1 : -1;
+	return 0;
+}
+
+/*
+ * Swap callback for sort(): exchange two exception-table entries in place.
+ * Both members are offsets relative to their own location, so every value is
+ * rebased by the distance between the two slots as it moves.  SIZE is unused.
+ */
+static void swap_ex(void *a, void *b, int size)
+{
+	struct exception_table_entry *first = a, *second = b, saved;
+	u64 dist = (u64) second - (u64) first;
+
+	saved = *second;
+	second->addr = first->addr - dist;
+	second->cont = first->cont - dist;
+	first->addr = saved.addr + dist;
+	first->cont = saved.cont + dist;
+}
+
+/*
+ * Sort the exception table [start, finish) in place.  The table is usually
+ * sorted already, but multiple text sections (such as the .init text
+ * section) can leave entries out of order.  Entries hold location-relative
+ * addresses, so they are compared and swapped through cmp_ex()/swap_ex(),
+ * which take care not to overflow the offset members (e.g., it would not be
+ * safe to make a temporary copy of an exception-table entry on the stack and
+ * use it from there, because the stack may be more than 2GB away from the
+ * exception-table).
+ */
+void sort_extable (struct exception_table_entry *start,
+		   struct exception_table_entry *finish)
+{
+	sort(start, finish - start, sizeof(*start), cmp_ex, swap_ex);
+}
+
+/*
+ * Binary-search the sorted table [first, last] (both ends inclusive) for the
+ * entry whose location-relative "addr" member resolves to IP.  Returns the
+ * matching entry, or NULL if there is none.
+ */
+const struct exception_table_entry *
+search_extable (const struct exception_table_entry *first,
+		const struct exception_table_entry *last,
+		unsigned long ip)
+{
+	while (first <= last) {
+		const struct exception_table_entry *probe = first + (last - first) / 2;
+		unsigned long probe_ip = (u64) &probe->addr + probe->addr;
+		long delta = probe_ip - ip;
+
+		if (delta < 0) {
+			/* probe lies below IP: continue in the upper half */
+			first = probe + 1;
+			continue;
+		}
+		if (delta > 0) {
+			/* probe lies above IP: continue in the lower half */
+			last = probe - 1;
+			continue;
+		}
+		return probe;
+	}
+	return NULL;
+}
+
+/*
+ * Redirect execution to the fixup ("continuation") recorded in exception
+ * table entry E.  The continuation value packs a bundle address with flags
+ * in its low four bits: bits 0-1 select the instruction slot, and bit 2
+ * requests that r9 be cleared.  r8 is always set to -EFAULT.
+ */
+void
+ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
+{
+	long cont = (u64) &e->cont + e->cont;
+
+	/* resume at the continuation bundle ... */
+	regs->cr_iip = cont & ~0xf;
+	/* ... in the continuation slot */
+	ia64_psr(regs)->ri = cont & 0x3;
+
+	regs->r8 = -EFAULT;
+	if (cont & 4)
+		regs->r9 = 0;
+}
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
new file mode 100644
index 000000000000..da859125aaef
--- /dev/null
+++ b/arch/ia64/mm/fault.c
@@ -0,0 +1,261 @@
+/*
+ * MMU fault handling support.
+ *
+ * Copyright (C) 1998-2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+extern void die (char *, struct pt_regs *, long);
+
+/*
+ * Counterpart of expand_stack() for the register backing store (RBS), which
+ * grows towards higher addresses.  Because the RBS is accessed sequentially,
+ * it is only ever grown by a single page per call.  The VM_GROWSUP flag can
+ * be set on any VM area, but that is fine because the total process size is
+ * still limited by RLIMIT_STACK and RLIMIT_AS.
+ *
+ * Returns 0 on success or -ENOMEM if one of the limits would be exceeded.
+ */
+static inline long
+expand_backing_store (struct vm_area_struct *vma, unsigned long address)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long npages = PAGE_SIZE >> PAGE_SHIFT;	/* exactly one page */
+
+	if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur)
+		return -ENOMEM;
+	if (((mm->total_vm + npages) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)
+		return -ENOMEM;
+
+	vma->vm_end += PAGE_SIZE;
+	mm->total_vm += npages;
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm += npages;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, npages);
+	return 0;
+}
+
+/*
+ * Walk the kernel page table for ADDRESS (a region 5 address on ia64) and
+ * return non-zero only if every level is populated and sane and the final
+ * pte is present; return 0 otherwise.
+ */
+static int
+mapped_kernel_page_is_present (unsigned long address)
+{
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, entry;
+
+	if (!pgd_none(*pgd) && !pgd_bad(*pgd)) {
+		pud = pud_offset(pgd, address);
+		if (!pud_none(*pud) && !pud_bad(*pud)) {
+			pmd = pmd_offset(pud, address);
+			if (!pmd_none(*pmd) && !pmd_bad(*pmd)) {
+				ptep = pte_offset_kernel(pmd, address);
+				if (ptep) {
+					entry = *ptep;
+					return pte_present(entry);
+				}
+			}
+		}
+	}
+	return 0;
+}
+
+void
+ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
+{
+	int signal = SIGSEGV, code = SEGV_MAPERR;
+	struct vm_area_struct *vma, *prev_vma;
+	struct mm_struct *mm = current->mm;
+	struct siginfo si;
+	unsigned long mask;
+
+	/*
+	 * Overall flow: find the vma covering ADDRESS (growing a VM_GROWSDOWN
+	 * stack, or a VM_GROWSUP register backing store that ends exactly at
+	 * ADDRESS, when the address falls just outside one), check the access
+	 * type reported in ISR against vma->vm_flags, then let handle_mm_fault()
+	 * resolve the fault.  On failure a user-mode fault gets a signal; a
+	 * kernel-mode fault goes through the exception-table fixup and, as a
+	 * last resort, ends in an oops.
+	 *
+	 * If we're in an interrupt or have no user context, we must not take the fault.
+	 */
+	if (in_atomic() || !mm)
+		goto no_context;
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+	/*
+	 * If fault is in region 5 and we are in the kernel, we may already
+	 * have the mmap_sem (pfn_valid macro is called during mmap). There
+	 * is no vma for region 5 addr's anyway, so skip getting the semaphore
+	 * and go directly to the exception handling code.
+	 */
+
+	if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
+		goto bad_area_no_up;
+#endif
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma_prev(mm, address, &prev_vma);
+	if (!vma)
+		goto bad_area;
+
+	/* find_vma_prev() returns vma such that address < vma->vm_end or NULL */
+	if (address < vma->vm_start)
+		goto check_expansion;
+
+  good_area:
+	code = SEGV_ACCERR;	/* from here on a failure is a permission problem, not a missing mapping */
+
+	/*
+	 * OK, we've got a good vm_area for this memory area.  Check the access permissions:
+	 * the X/W/R bits of the ISR are moved into the bit positions of VM_EXEC/VM_WRITE/VM_READ
+	 * so that they can be compared against vma->vm_flags directly.  The #if below makes the
+	 * build fail if those flag values ever change.
+	 */
+
+#	define VM_READ_BIT	0
+#	define VM_WRITE_BIT	1
+#	define VM_EXEC_BIT	2
+
+#	if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
+	    || (1 << VM_EXEC_BIT) != VM_EXEC)
+#		error File is out of sync with <linux/mm.h>.  Please update.
+#	endif
+
+	mask = (  (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
+		| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
+		| (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
+
+	if ((vma->vm_flags & mask) != mask)
+		goto bad_area;
+
+  survive:
+	/*
+	 * If for any reason at all we couldn't handle the fault, make
+	 * sure we exit gracefully rather than endlessly redo the
+	 * fault.
+	 */
+	switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) {
+	      case VM_FAULT_MINOR:
+		++current->min_flt;
+		break;
+	      case VM_FAULT_MAJOR:
+		++current->maj_flt;
+		break;
+	      case VM_FAULT_SIGBUS:
+		/*
+		 * We ran out of memory, or some other thing happened
+		 * to us that made us unable to handle the page fault
+		 * gracefully.
+		 */
+		signal = SIGBUS;
+		goto bad_area;
+	      case VM_FAULT_OOM:
+		goto out_of_memory;
+	      default:
+		BUG();
+	}
+	up_read(&mm->mmap_sem);
+	return;
+
+  check_expansion:
+	if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto bad_area;
+		if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+		    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+			goto bad_area;
+		if (expand_stack(vma, address))
+			goto bad_area;
+	} else {
+		vma = prev_vma;	/* ADDRESS is exactly at the end of a VM_GROWSUP vma: grow that one */
+		if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+		    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+			goto bad_area;
+		if (expand_backing_store(vma, address))
+			goto bad_area;
+	}
+	goto good_area;
+
+  bad_area:
+	up_read(&mm->mmap_sem);
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+  bad_area_no_up:
+#endif
+	if ((isr & IA64_ISR_SP)
+	    || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+	{
+		/*
+		 * This fault was due to a speculative load or lfetch.fault, set the "ed"
+		 * bit in the psr to ensure forward progress.  (Target register will get a
+		 * NaT for ld.s, lfetch will be canceled.)
+		 */
+		ia64_psr(regs)->ed = 1;
+		return;
+	}
+	if (user_mode(regs)) {
+		si.si_signo = signal;
+		si.si_errno = 0;
+		si.si_code = code;
+		si.si_addr = (void __user *) address;
+		si.si_isr = isr;
+		si.si_flags = __ISR_VALID;
+		force_sig_info(signal, &si, current);
+		return;
+	}
+
+  no_context:
+	if (isr & IA64_ISR_SP) {
+		/*
+		 * This fault was due to a speculative load; set the "ed" bit in the psr to
+		 * ensure forward progress (target register will get a NaT).
+		 */
+		ia64_psr(regs)->ed = 1;
+		return;
+	}
+
+	if (ia64_done_with_exception(regs))
+		return;
+
+	/*
+	 * Since we have no vma's for region 5, we might get here even if the address is
+	 * valid, due to the VHPT walker inserting a non present translation that becomes
+	 * stale. If that happens, the non present fault handler already purged the stale
+	 * translation, which fixed the problem. So, we check to see if the translation is
+	 * valid, and return if it is.
+	 */
+	if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
+		return;
+
+	/*
+	 * Oops. The kernel tried to access some bad page. We'll have to terminate things
+	 * with extreme prejudice.
+	 */
+	bust_spinlocks(1);
+
+	if (address < PAGE_SIZE)
+		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
+	else
+		printk(KERN_ALERT "Unable to handle kernel paging request at "
+		       "virtual address %016lx\n", address);
+	die("Oops", regs, isr);
+	bust_spinlocks(0);
+	do_exit(SIGKILL);
+	return;
+
+  out_of_memory:
+	up_read(&mm->mmap_sem);
+	if (current->pid == 1) {	/* pid 1 is never killed here: yield and retry the fault */
+		yield();
+		down_read(&mm->mmap_sem);
+		goto survive;
+	}
+	printk(KERN_CRIT "VM: killing process %s\n", current->comm);
+	if (user_mode(regs))
+		do_exit(SIGKILL);
+	goto no_context;
+}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..40ad8328ffd5
--- /dev/null
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -0,0 +1,357 @@
+/*
+ * IA-64 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * Sep, 2003: add numa support
+ * Feb, 2004: dynamic hugetlb page size via boot parameter
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;	/* log2 of the huge page size; hugetlb_setup_sz() overrides it from "hugepagesz=" */
+
+/*
+ * Return the pte slot that maps the huge page at ADDR in MM, allocating any
+ * missing page-table levels on the way.  Returns NULL if an allocation fails.
+ */
+static pte_t *
+huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long taddr = htlbpage_to_page(addr);
+	pgd_t *pgd = pgd_offset(mm, taddr);
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = pud_alloc(mm, pgd, taddr);
+	if (!pud)
+		return NULL;
+
+	pmd = pmd_alloc(mm, pud, taddr);
+	if (!pmd)
+		return NULL;
+
+	return pte_alloc_map(mm, pmd, taddr);
+}
+
+/*
+ * Look up (without allocating) the pte slot that maps the huge page at ADDR
+ * in MM.  Returns NULL if any page-table level above the pte is not present.
+ */
+static pte_t *
+huge_pte_offset (struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long taddr = htlbpage_to_page(addr);
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, taddr);
+	if (!pgd_present(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, taddr);
+	if (!pud_present(*pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, taddr);
+	if (!pmd_present(*pmd))
+		return NULL;
+
+	return pte_offset_map(pmd, taddr);
+}
+
+#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
+
+/*
+ * Install PAGE at PAGE_TABLE as a huge-page mapping with VMA's protection:
+ * writable and dirty when WRITE_ACCESS is set, write-protected otherwise,
+ * and always young and present.  MM's rss is charged for one huge page worth
+ * of base pages.
+ */
+static void
+set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
+	      struct page *page, pte_t * page_table, int write_access)
+{
+	pte_t entry = mk_pte(page, vma->vm_page_prot);
+
+	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+
+	if (write_access)
+		entry = pte_mkwrite(pte_mkdirty(entry));
+	else
+		entry = pte_wrprotect(entry);
+
+	entry = pte_mkyoung(entry);
+	mk_pte_huge(entry);
+	set_pte(page_table, entry);
+}
+/*
+ * Validate a prospective huge-page range: both ADDR and LEN must be
+ * multiples of the huge page size and ADDR must lie in the huge-page region.
+ * Returns 0 if so, -EINVAL otherwise.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if ((len | addr) & ~HPAGE_MASK)
+		return -EINVAL;
+
+	return (REGION_NUMBER(addr) == REGION_HPAGE) ? 0 : -EINVAL;
+}
+
+/*
+ * Duplicate the huge-page mappings of VMA from SRC into DST: every huge pte
+ * is copied as is, the mapped page gains a reference and DST's rss is charged.
+ * Returns 0 on success or -ENOMEM if a page table for DST cannot be allocated.
+ */
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			struct vm_area_struct *vma)
+{
+	unsigned long end = vma->vm_end;
+	unsigned long addr;
+
+	for (addr = vma->vm_start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *dst_pte, *src_pte, entry;
+		struct page *ptepage;
+
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			return -ENOMEM;
+
+		src_pte = huge_pte_offset(src, addr);
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		set_pte(dst_pte, entry);
+		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+	}
+	return 0;
+}
+
+/*
+ * Walk the huge-page mappings of VMA starting at *st and record up to
+ * *length base-page sized entries: pages[] receives the struct page (with a
+ * reference taken) for each PAGE_SIZE step and vmas[] receives VMA; either
+ * array may be NULL.  The huge pte is looked up once per huge page.  On
+ * return *st and *length hold the next address and the remaining count; the
+ * return value is the updated index I.
+ */
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		    struct page **pages, struct vm_area_struct **vmas,
+		    unsigned long *st, int *length, int i)
+{
+	unsigned long vaddr = *st;
+	int remaining = *length;
+
+	do {
+		/* one pte lookup serves every base page of this huge page */
+		unsigned long hpage_base = vaddr & HPAGE_MASK;
+		pte_t *ptep = huge_pte_offset(mm, vaddr);
+		pte_t pte = *ptep;
+
+		do {
+			struct page *page = pte_page(pte);
+
+			if (pages) {
+				page += ((vaddr & ~HPAGE_MASK) >> PAGE_SHIFT);
+				get_page(page);
+				pages[i] = page;
+			}
+			if (vmas)
+				vmas[i] = vma;
+			i++;
+			remaining--;
+			vaddr += PAGE_SIZE;
+		} while (((vaddr & HPAGE_MASK) == hpage_base) && remaining &&
+			 (vaddr < vma->vm_end));
+	} while (remaining && vaddr < vma->vm_end);
+
+	*length = remaining;
+	*st = vaddr;
+	return i;
+}
+
+/*
+ * Translate ADDR in MM to the struct page of the base page it falls on
+ * inside a huge page.  Returns ERR_PTR(-EINVAL) if ADDR is outside the
+ * huge-page region and NULL if nothing is mapped there.  WRITE is unused.
+ */
+struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
+{
+	pte_t *ptep;
+
+	if (REGION_NUMBER(addr) != REGION_HPAGE)
+		return ERR_PTR(-EINVAL);
+
+	ptep = huge_pte_offset(mm, addr);
+	if (ptep && !pte_none(*ptep))
+		return pte_page(*ptep) + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+
+	return NULL;
+}
+int pmd_huge(pmd_t pmd)
+{
+	return 0;	/* unconditional: this implementation never reports a pmd as a huge-page entry */
+}
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+{
+	return NULL;	/* stub: always NULL, all arguments are ignored (pmd_huge() in this file always returns 0) */
+}
+
+/*
+ * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
+ * are hugetlb region specific.
+ *
+ * The range [start, end) is first widened to [first, last) using
+ * HUGETLB_PGDIR_MASK/HUGETLB_PGDIR_SIZE and then clipped against the
+ * neighbouring vmas (PREV, or the head of the mm's vma list when PREV is
+ * NULL, and the vma that follows) so that page tables still needed by a
+ * neighbour are left alone.  Whatever remains is handed to
+ * clear_page_range().
+ */
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+	unsigned long start, unsigned long end)
+{
+	unsigned long first = start & HUGETLB_PGDIR_MASK;
+	unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
+	struct mm_struct *mm = tlb->mm;
+
+	if (!prev) {
+		prev = mm->mmap;	/* no predecessor given: start from the head of the vma list */
+		if (!prev)
+			goto no_mmaps;
+		if (prev->vm_end > start) {
+			if (last > prev->vm_start)
+				last = prev->vm_start;
+			goto no_mmaps;
+		}
+	}
+	for (;;) {
+		struct vm_area_struct *next = prev->vm_next;
+
+		if (next) {
+			if (next->vm_start < start) {	/* still below the range: keep walking */
+				prev = next;
+				continue;
+			}
+			if (last > next->vm_start)	/* do not run into the following vma */
+				last = next->vm_start;
+		}
+		if (prev->vm_end > first)	/* do not reach back into the preceding vma */
+			first = prev->vm_end;
+		break;
+	}
+no_mmaps:
+	if (last < first)	/* for arches with discontiguous pgd indices */
+		return;
+	clear_page_range(tlb, first, last);
+}
+
+/*
+ * Tear down the huge-page mappings of VMA in [start, end); both bounds must
+ * be huge-page aligned.  Each mapped huge page loses the reference held by
+ * its pte, the pte is cleared, and the TLB is flushed for the whole range.
+ *
+ * The range may contain holes: e.g. when hugetlb_prefault() fails part of
+ * the way through, later addresses have no pte (or even no page table) yet.
+ * Such holes are skipped, and rss is only reduced for huge pages that were
+ * actually mapped, mirroring the per-page charge made when they were set up.
+ */
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	unsigned long unmapped = 0;	/* base pages actually unmapped */
+	pte_t *pte;
+	struct page *page;
+
+	BUG_ON(start & (HPAGE_SIZE - 1));
+	BUG_ON(end & (HPAGE_SIZE - 1));
+
+	for (address = start; address < end; address += HPAGE_SIZE) {
+		pte = huge_pte_offset(mm, address);
+		/* huge_pte_offset() returns NULL when an upper level is missing */
+		if (!pte || pte_none(*pte))
+			continue;
+		page = pte_page(*pte);
+		put_page(page);
+		pte_clear(mm, address, pte);
+		unmapped += HPAGE_SIZE / PAGE_SIZE;
+	}
+	add_mm_counter(mm, rss, - unmapped);
+	flush_tlb_range(vma, start, end);
+}
+
+/*
+ * Populate every huge page of VMA up front.  For each huge-page slot that
+ * is still empty, the backing page is taken from MAPPING's page cache or,
+ * if it is not there yet, allocated (after charging the filesystem quota)
+ * and added to the page cache; it is then mapped with set_huge_pte().  The
+ * whole walk runs under the mm's page_table_lock.
+ *
+ * Returns 0 on success, -ENOMEM on allocation/quota failure, or the error
+ * returned by add_to_page_cache().
+ */
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
+
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		struct page *page;
+		unsigned long idx;
+		pte_t *pte;
+
+		pte = huge_pte_alloc(mm, addr);
+		if (!pte) {
+			ret = -ENOMEM;
+			break;
+		}
+		if (!pte_none(*pte))
+			continue;	/* already populated */
+
+		/* page-cache index, counted in huge pages */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			/* charge the fs quota first */
+			if (hugetlb_get_quota(mapping)) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = alloc_huge_page();
+			if (!page) {
+				hugetlb_put_quota(mapping);
+				ret = -ENOMEM;
+				break;
+			}
+			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+			if (ret) {
+				/* undo the quota charge and drop the new page */
+				hugetlb_put_quota(mapping);
+				page_cache_release(page);
+				break;
+			}
+			unlock_page(page);
+		}
+		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+	}
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+/*
+ * Find a free, huge-page aligned range of LEN bytes in the huge-page region.
+ * ADDR is used as the starting hint only if it is huge-page aligned and
+ * already inside that region; otherwise the search starts at
+ * HPAGE_REGION_BASE.  Returns the address found, -EINVAL if LEN is not a
+ * multiple of the huge page size, or -ENOMEM if the range cannot fit below
+ * RGN_MAP_LIMIT.  FILE, PGOFF and FLAGS are not used.
+ */
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct vm_area_struct *vmm;
+
+	if (len > RGN_MAP_LIMIT)
+		return -ENOMEM;
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+
+	/* This code assumes that REGION_HPAGE != 0. */
+	if ((REGION_NUMBER(addr) == REGION_HPAGE) && !(addr & (HPAGE_SIZE - 1)))
+		addr = ALIGN(addr, HPAGE_SIZE);
+	else
+		addr = HPAGE_REGION_BASE;
+
+	vmm = find_vma(current->mm, addr);
+	for (;;) {
+		/* At this point:  (!vmm || addr < vmm->vm_end). */
+		if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
+			return -ENOMEM;
+		if (!vmm || (addr + len) <= vmm->vm_start)
+			return addr;
+		/* the candidate overlaps this vma: retry just past it */
+		addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
+		vmm = vmm->vm_next;
+	}
+}
+
+/*
+ * Handler for the "hugepagesz=" boot parameter.  The requested size must be
+ * a power of two whose bit is set in the page-size mask obtained from
+ * ia64_pal_vm_page_size(), larger than PAGE_SIZE and smaller than
+ * PAGE_SIZE << MAX_ORDER; anything else is rejected with a warning.  A valid
+ * size updates hpage_shift and the huge-page region register.  Always
+ * returns 1.
+ */
+static int __init hugetlb_setup_sz(char *str)
+{
+	u64 tr_pages;
+	unsigned long long size;
+	int valid;
+
+	/* shouldn't happen, but just in case: fall back to a fixed mask */
+	if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
+		tr_pages = 0x15557000UL;
+
+	size = memparse(str, &str);
+	valid = !*str && !(size & (size-1)) && (tr_pages & size) &&
+		size > PAGE_SIZE &&
+		size < (1UL << PAGE_SHIFT << MAX_ORDER);
+	if (!valid) {
+		printk(KERN_WARNING "Invalid huge page size specified\n");
+		return 1;
+	}
+
+	hpage_shift = __ffs(size);
+	/*
+	 * The boot cpu has already executed ia64_mmu_init() with
+	 * HPAGE_SHIFT_DEFAULT; override that here with the new page shift.
+	 */
+	ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
+	return 1;
+}
+__setup("hugepagesz=", hugetlb_setup_sz);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
new file mode 100644
index 000000000000..65cf839573ea
--- /dev/null
+++ b/arch/ia64/mm/init.c
@@ -0,0 +1,597 @@
+/*
+ * Initialize MMU support.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/bitops.h>
+
+#include <asm/a.out.h>
+#include <asm/dma.h>
+#include <asm/ia32.h>
+#include <asm/io.h>
+#include <asm/machvec.h>
+#include <asm/numa.h>
+#include <asm/patch.h>
+#include <asm/pgalloc.h>
+#include <asm/sal.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/tlb.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/mca.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+extern void ia64_tlb_init (void);
+
+unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;	/* PAGE_OFFSET + 4GB */
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+unsigned long vmalloc_end = VMALLOC_END_INIT;	/* lowered at boot to make room for the virtual mem_map */
+EXPORT_SYMBOL(vmalloc_end);
+struct page *vmem_map;				/* base of the virtual mem_map; NULL while it is not in use */
+EXPORT_SYMBOL(vmem_map);
+#endif
+
+static int pgt_cache_water[2] = { 25, 50 };	/* { low, high } water marks read by check_pgt_cache() */
+
+struct page *zero_page_memmap_ptr;		/* map entry for zero page */
+EXPORT_SYMBOL(zero_page_memmap_ptr);
+
+/*
+ * Trim the page-table quicklists: once the cache has grown beyond the high
+ * water mark, pgd and pmd pages are released until it is back at or below
+ * the low water mark.  Runs with preemption disabled.
+ */
+void
+check_pgt_cache (void)
+{
+	const int low_mark = pgt_cache_water[0];
+	const int high_mark = pgt_cache_water[1];
+
+	preempt_disable();
+	if (pgtable_cache_size > (u64) high_mark) {
+		for (;;) {
+			if (pgd_quicklist)
+				free_page((unsigned long)pgd_alloc_one_fast(NULL));
+			if (pmd_quicklist)
+				free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
+			if (!(pgtable_cache_size > (u64) low_mark))
+				break;
+		}
+	}
+	preempt_enable();
+}
+
+/*
+ * Make the i-cache coherent with the d-cache for the page mapped by PTE, if
+ * that page is executable and has not been flushed yet.  PG_arch_1 records
+ * that the flush has been done.
+ */
+void
+lazy_mmu_prot_update (pte_t pte)
+{
+	unsigned long addr;
+	struct page *page;
+
+	if (pte_exec(pte)) {
+		page = pte_page(pte);
+		addr = (unsigned long) page_address(page);
+
+		/* PG_arch_1 set: i-cache is already coherent with d-cache */
+		if (!test_bit(PG_arch_1, &page->flags)) {
+			flush_icache_range(addr, addr + PAGE_SIZE);
+			set_bit(PG_arch_1, &page->flags);	/* mark page as clean */
+		}
+	}
+}
+
+/*
+ * Compute the bottom of the current task's register backing store: the hard
+ * RLIMIT_STACK limit, rounded down to a multiple of 16 and capped at
+ * MAX_USER_STACK_SIZE, below STACK_TOP.
+ */
+inline void
+ia64_set_rbs_bot (void)
+{
+	unsigned long limit = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
+	unsigned long span = (limit > MAX_USER_STACK_SIZE) ? MAX_USER_STACK_SIZE : limit;
+
+	current->thread.rbs_bot = STACK_TOP - span;
+}
+
+/*
+ * Platform-dependent address space initialization.  On IA-64 this sets up
+ * the VM area for the register backing store (which grows upwards) and,
+ * unless the personality has MMAP_PAGE_ZERO set, maps a NaT page at address
+ * zero.
+ */
+void
+ia64_init_addr_space (void)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int err;
+
+	ia64_set_rbs_bot();
+
+	/*
+	 * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
+	 * the problem.  When the process attempts to write to the register backing store
+	 * for the first time, it will get a SEGFAULT in this case.
+	 */
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (vma) {
+		mm = current->mm;
+		memset(vma, 0, sizeof(*vma));
+		vma->vm_mm = mm;
+		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
+		vma->vm_end = vma->vm_start + PAGE_SIZE;
+		vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
+		vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+
+		down_write(&mm->mmap_sem);
+		err = insert_vm_struct(mm, vma);
+		up_write(&mm->mmap_sem);
+		if (err) {
+			kmem_cache_free(vm_area_cachep, vma);
+			return;
+		}
+	}
+
+	if (current->personality & MMAP_PAGE_ZERO)
+		return;
+
+	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma)
+		return;
+
+	mm = current->mm;
+	memset(vma, 0, sizeof(*vma));
+	vma->vm_mm = mm;
+	vma->vm_end = PAGE_SIZE;
+	vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
+	vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+
+	down_write(&mm->mmap_sem);
+	err = insert_vm_struct(mm, vma);
+	up_write(&mm->mmap_sem);
+	if (err)
+		kmem_cache_free(vm_area_cachep, vma);
+}
+
+/*
+ * Release the pages between __init_begin and __init_end: each page is
+ * un-reserved, given a reference count of 1 and freed, and totalram_pages
+ * is bumped accordingly.
+ */
+void
+free_initmem (void)
+{
+	unsigned long addr = (unsigned long) ia64_imva(__init_begin);
+	unsigned long eaddr = (unsigned long) ia64_imva(__init_end);
+
+	for (; addr < eaddr; addr += PAGE_SIZE) {
+		struct page *page = virt_to_page(addr);
+
+		ClearPageReserved(page);
+		set_page_count(page, 1);
+		free_page(addr);
+		++totalram_pages;
+	}
+	printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
+	       (__init_end - __init_begin) >> 10);
+}
+
+void
+free_initrd_mem (unsigned long start, unsigned long end)
+{
+	struct page *page;
+	/*
+	 * EFI uses 4KB pages while the kernel can use 4KB or bigger.
+	 * Thus EFI and the kernel may have different page sizes. It is
+	 * therefore possible to have the initrd share the same page as
+	 * the end of the kernel (given current setup).
+	 *
+	 * To avoid freeing/using the wrong page (kernel sized) we:
+	 *	- align up the beginning of initrd
+	 *	- align down the end of initrd
+	 *
+	 *  |             |
+	 *  |=============| a000
+	 *  |             |
+	 *  |             |
+	 *  |             | 9000
+	 *  |/////////////|
+	 *  |/////////////|
+	 *  |=============| 8000
+	 *  |///INITRD////|
+	 *  |/////////////|
+	 *  |/////////////| 7000
+	 *  |             |
+	 *  |KKKKKKKKKKKKK|
+	 *  |=============| 6000
+	 *  |KKKKKKKKKKKKK|
+	 *  |KKKKKKKKKKKKK|
+	 *  K=kernel using 8KB pages
+	 *
+	 * With the rounding below, only kernel-sized pages that lie entirely
+	 * inside [start, end) are freed; a page the initrd covers only in
+	 * part (such as 6000, which it shares with the kernel) is left alone.
+	 */
+	start = PAGE_ALIGN(start);
+	end &= PAGE_MASK;
+
+	if (start < end)
+		printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
+
+	while (start < end) {
+		if (virt_addr_valid(start)) {
+			page = virt_to_page(start);
+			ClearPageReserved(page);
+			set_page_count(page, 1);
+			free_page(start);
+			++totalram_pages;
+		}
+		start += PAGE_SIZE;
+	}
+}
+
+/*
+ * Install PAGE at ADDRESS in the kernel's page table with protection PGPROT,
+ * allocating intermediate page-table levels as needed.  Nothing is changed
+ * if an allocation fails or if a pte is already installed at ADDRESS.  A
+ * page that is not reserved is reported but still installed.  PAGE is
+ * returned in every case.
+ */
+struct page *
+put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (!PageReserved(page))
+		printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
+		       page_address(page));
+
+	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
+
+	spin_lock(&init_mm.page_table_lock);
+	pud = pud_alloc(&init_mm, pgd, address);
+	if (pud) {
+		pmd = pmd_alloc(&init_mm, pud, address);
+		if (pmd) {
+			pte = pte_alloc_map(&init_mm, pmd, address);
+			if (pte) {
+				/* only fill an empty slot */
+				if (pte_none(*pte))
+					set_pte(pte, mk_pte(page, pgprot));
+				pte_unmap(pte);
+			}
+		}
+	}
+	spin_unlock(&init_mm.page_table_lock);
+	/* no need for flush_tlb */
+	return page;
+}
+
+static void
+setup_gate (void)
+{
+	struct page *page;
+
+	/*
+	 * Map the gate page twice: once read-only to export the ELF headers etc. and once
+	 * execute-only page to enable privilege-promotion via "epc":
+	 *
+	 * The read-only mapping of the page at __start_gate_section goes to GATE_ADDR.
+	 * With HAVE_BUGGY_SEGREL the PAGE_GATE mapping uses the page that follows
+	 * __start_gate_section and is placed at GATE_ADDR + PAGE_SIZE; otherwise the
+	 * same page is mapped a second time at GATE_ADDR + PERCPU_PAGE_SIZE.
+	 * ia64_patch_gate() is called once both mappings are in place.
+	 */
+	page = virt_to_page(ia64_imva(__start_gate_section));
+	put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
+#ifdef HAVE_BUGGY_SEGREL
+	page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
+	put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
+#else
+	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
+#endif
+	ia64_patch_gate();
+}
+
+void __devinit
+ia64_mmu_init (void *my_cpu_data)
+{
+	unsigned long psr, pta, impl_va_bits;
+	extern void __devinit tlb_init (void);
+
+#ifdef CONFIG_DISABLE_VHPT
+#	define VHPT_ENABLE_BIT	0
+#else
+#	define VHPT_ENABLE_BIT	1
+#endif
+
+	/*
+	 * Pin mapping for percpu area into TLB: MY_CPU_DATA is mapped at PERCPU_ADDR
+	 * through translation register IA64_TR_PERCPU_DATA.  The psr value returned by
+	 * ia64_clear_ic() is restored once the entry has been written.
+	 */
+	psr = ia64_clear_ic();
+	ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
+		 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
+		 PERCPU_PAGE_SHIFT);
+
+	ia64_set_psr(psr);
+	ia64_srlz_i();
+
+	/*
+	 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
+	 * address space.  The IA-64 architecture guarantees that at least 50 bits of
+	 * virtual address space are implemented but if we pick a large enough page size
+	 * (e.g., 64KB), the mapped address space is big enough that it will overlap with
+	 * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
+	 * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
+	 * problem in practice.  Alternatively, we could truncate the top of the mapped
+	 * address space to not permit mappings that would overlap with the VMLPT.
+	 * --davidm 00/12/06
+	 */
+#	define pte_bits			3
+#	define mapped_space_bits	(3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
+	/*
+	 * The virtual page table has to cover the entire implemented address space within
+	 * a region even though not all of this space may be mappable.  The reason for
+	 * this is that the Access bit and Dirty bit fault handlers perform
+	 * non-speculative accesses to the virtual page table, so the address range of the
+	 * virtual page table itself needs to be covered by virtual page table.
+	 */
+#	define vmlpt_bits		(impl_va_bits - PAGE_SHIFT + pte_bits)
+#	define POW2(n)			(1ULL << (n))
+
+	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));	/* index of the lowest unimplemented VA bit, i.e. IMPL_VA_MSB + 1 */
+
+	if (impl_va_bits < 51 || impl_va_bits > 61)
+		panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
+
+	/* place the VMLPT at the end of each page-table mapped region: */
+	pta = POW2(61) - POW2(vmlpt_bits);
+
+	if (POW2(mapped_space_bits) >= pta)
+		panic("mm/init: overlap between virtually mapped linear page table and "
+		      "mapped kernel space!");
+	/*
+	 * Set the (virtually mapped linear) page table address.  Bit
+	 * 8 selects between the short and long format, bits 2-7 the
+	 * size of the table, and bit 0 whether the VHPT walker is
+	 * enabled.
+	 */
+	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
+
+	ia64_tlb_init();
+
+#ifdef	CONFIG_HUGETLB_PAGE
+	ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);	/* huge-page region register gets the huge page shift, placed at bit 2 */
+	ia64_srlz_d();
+#endif
+}
+
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+
+/*
+ * efi_memmap_walk() callback: make sure that the part of the virtual mem_map
+ * describing the physical range [start, end) is backed by memory.  Every
+ * missing page-table level and every missing mem_map page is taken from the
+ * boot-memory allocator of the node that START belongs to.  ARG is unused;
+ * always returns 0.
+ */
+int
+create_mem_map_page_table (u64 start, u64 end, void *arg)
+{
+	struct page *first_map = vmem_map + (__pa(start) >> PAGE_SHIFT);
+	struct page *last_map = vmem_map + (__pa(end) >> PAGE_SHIFT);
+	unsigned long va = (unsigned long) first_map & PAGE_MASK;
+	unsigned long va_end = PAGE_ALIGN((unsigned long) last_map);
+	int node = paddr_to_nid(__pa(start));
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	while (va < va_end) {
+		pgd = pgd_offset_k(va);
+		if (pgd_none(*pgd))
+			pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+
+		pud = pud_offset(pgd, va);
+		if (pud_none(*pud))
+			pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+
+		pmd = pmd_offset(pud, va);
+		if (pmd_none(*pmd))
+			pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+
+		pte = pte_offset_kernel(pmd, va);
+		if (pte_none(*pte))
+			set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
+					     PAGE_KERNEL));
+
+		va += PAGE_SIZE;
+	}
+	return 0;
+}
+
+struct memmap_init_callback_data {	/* argument block handed from memmap_init() to virtual_memmap_init() */
+	struct page *start;	/* first mem_map entry of the range being initialized */
+	struct page *end;	/* one past the last entry (start + size) */
+	int nid;		/* node id, forwarded to memmap_init_zone() */
+	unsigned long zone;	/* zone, forwarded to memmap_init_zone() */
+};
+
+/*
+ * efi_memmap_walk() callback used by memmap_init(): initialize the mem_map
+ * entries that describe the physical range [start, end), clipped to the
+ * range given in ARG (a struct memmap_init_callback_data).  Always returns 0.
+ */
+static int
+virtual_memmap_init (u64 start, u64 end, void *arg)
+{
+	struct memmap_init_callback_data *cb = arg;
+	struct page *lo = vmem_map + (__pa(start) >> PAGE_SHIFT);
+	struct page *hi = vmem_map + (__pa(end) >> PAGE_SHIFT);
+
+	/* clip to the range the caller asked for */
+	if (lo < cb->start)
+		lo = cb->start;
+	if (hi > cb->end)
+		hi = cb->end;
+
+	/*
+	 * We have to initialize "out of bounds" struct page elements that fit completely
+	 * on the same pages that were allocated for the "in bounds" elements because they
+	 * may be referenced later (and found to be "reserved").
+	 */
+	lo -= ((unsigned long) lo & (PAGE_SIZE - 1)) / sizeof(struct page);
+	hi += (PAGE_ALIGN((unsigned long) hi) - (unsigned long) hi) / sizeof(struct page);
+
+	if (lo < hi)
+		memmap_init_zone((unsigned long)(hi - lo),
+				 cb->nid, cb->zone, page_to_pfn(lo));
+	return 0;
+}
+
+/*
+ * Initialize SIZE mem_map entries of zone ZONE on node NID, starting at
+ * START_PFN.  Without a virtual mem_map this is simply memmap_init_zone();
+ * with one, the EFI memory map is walked so that only entries backed by
+ * real memory are touched (see virtual_memmap_init()).
+ */
+void
+memmap_init (unsigned long size, int nid, unsigned long zone,
+	     unsigned long start_pfn)
+{
+	struct memmap_init_callback_data args;
+
+	if (!vmem_map) {
+		memmap_init_zone(size, nid, zone, start_pfn);
+		return;
+	}
+
+	args.start = pfn_to_page(start_pfn);
+	args.end = args.start + size;
+	args.nid = nid;
+	args.zone = zone;
+
+	efi_memmap_walk(virtual_memmap_init, &args);
+}
+
+/* Nonzero iff __get_user() can read the first and (if on another page) last byte of pfn's struct page. */
+int ia64_pfn_valid (unsigned long pfn)
+{
+	char byte;
+	char __user *first = (char __user *) pfn_to_page(pfn);
+	char __user *last = first + sizeof(struct page) - 1;
+
+	return !__get_user(byte, first)
+		&& ((((u64) first ^ (u64) last) & PAGE_MASK) == 0 || !__get_user(byte, last));
+}
+EXPORT_SYMBOL(ia64_pfn_valid);
+
+/* Range callback (efi_memmap_walk()-style): raise *arg (a u64) to the largest gap seen so far. */
+int
+find_largest_hole (u64 start, u64 end, void *arg)
+{
+	static u64 last_end = PAGE_OFFSET;	/* end of the previous range, kept across calls */
+	u64 *max_gap = arg;
+	u64 gap = start - last_end;
+
+	/* NOTE: this algorithm assumes efi memmap table is ordered */
+	if (gap > *max_gap)
+		*max_gap = gap;
+	last_end = end;
+	return 0;
+}
+#endif /* CONFIG_VIRTUAL_MEM_MAP */
+
+/* efi_memmap_walk() callback: add the number of PageReserved() pages in [start, end) to *arg. */
+static int
+count_reserved_pages (u64 start, u64 end, void *arg)
+{
+	unsigned long *total = arg;
+	u64 addr;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE)
+		if (PageReserved(virt_to_page(addr)))
+			++*total;
+	return 0;
+}
+
+/*
+ * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
+ * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
+ * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
+ * useful for performance testing, but conceivably could also come in handy for debugging
+ * purposes.
+ */
+
+static int nolwsys;	/* nonzero => mem_init() points every fsyscall_table entry at the heavy-weight handler */
+
+static int __init
+nolwsys_setup (char *s)
+{
+	nolwsys = 1;	/* option was given; its argument string `s' is ignored */
+	return 1;
+}
+
+__setup("nolwsys", nolwsys_setup);	/* registers nolwsys_setup() for the "nolwsys" boot option */
+
+/* Late memory init: free bootmem, print the memory summary, size pgt cache, patch fsyscall_table. */
+void
+mem_init (void)
+{
+	long reserved_pages, codesize, datasize, initsize;
+	unsigned long num_pgt_pages;
+	pg_data_t *pgdat;
+	int i;
+	static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
+
+#ifdef CONFIG_PCI
+	/*
+	 * This needs to be called _after_ the command line has been parsed but _before_ any
+	 * drivers that may need the PCI DMA interface are initialized or bootmem has been freed.
+	 */
+	platform_dma_init();
+#endif
+
+#ifndef CONFIG_DISCONTIGMEM
+	if (!mem_map)
+		BUG();
+	max_mapnr = max_low_pfn;
+#endif
+
+	high_memory = __va(max_low_pfn * PAGE_SIZE);
+
+	kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
+	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+	kclist_add(&kcore_kernel, _stext, _end - _stext);
+
+	for_each_pgdat(pgdat)
+		totalram_pages += free_all_bootmem_node(pgdat);	/* sum the per-node return values */
+
+	reserved_pages = 0;
+	efi_memmap_walk(count_reserved_pages, &reserved_pages);	/* callback adds into reserved_pages */
+
+	codesize =  (unsigned long) _etext - (unsigned long) _stext;
+	datasize =  (unsigned long) _edata - (unsigned long) _etext;
+	initsize =  (unsigned long) __init_end - (unsigned long) __init_begin;
+
+	printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
+	       "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
+	       num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
+	       reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
+
+	/*
+	 * Allow for enough (cached) page table pages so that we can map the entire memory
+	 * at least once.  Each task also needs a couple of page tables pages, so add in a
+	 * fudge factor for that (don't use "threads-max" here; that would be wrong!).
+	 * Don't allow the cache to be more than 10% of total memory, though.
+	 */
+#	define NUM_TASKS	500	/* typical number of tasks */
+	num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
+	if (num_pgt_pages > nr_free_pages() / 10)
+		num_pgt_pages = nr_free_pages() / 10;
+	if (num_pgt_pages > (u64) pgt_cache_water[1])
+		pgt_cache_water[1] = num_pgt_pages;	/* pgt_cache_water[1] is only raised, never lowered */
+
+	/*
+	 * For fsyscall entry points with no light-weight handler, use the ordinary
+	 * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
+	 * code can tell them apart.
+	 */
+	for (i = 0; i < NR_syscalls; ++i) {
+		extern unsigned long fsyscall_table[NR_syscalls];
+		extern unsigned long sys_call_table[NR_syscalls];
+
+		if (!fsyscall_table[i] || nolwsys)	/* "nolwsys" overrides existing handlers too */
+			fsyscall_table[i] = sys_call_table[i] | 1;
+	}
+	setup_gate();
+
+#ifdef CONFIG_IA32_SUPPORT
+	ia32_mem_init();
+#endif
+}
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
new file mode 100644
index 000000000000..77118bbf3d8b
--- /dev/null
+++ b/arch/ia64/mm/numa.c
@@ -0,0 +1,49 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * This file contains NUMA specific variables and functions which can
+ * be split away from DISCONTIGMEM and are used on NUMA machines with
+ * contiguous memory.
+ * 
+ *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
+ */
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/node.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/mmzone.h>
+#include <asm/numa.h>
+
+
+/*
+ * The following structures are usually initialized by ACPI or
+ * similar mechanisms and describe the NUMA characteristics of the machine.
+ */
+int num_node_memblks;	/* number of node_memblk[] entries scanned by paddr_to_nid() */
+struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];	/* fields used in this file: start_paddr, size, nid */
+struct node_cpuid_s node_cpuid[NR_CPUS];
+/*
+ * Flat MAX_NUMNODES x MAX_NUMNODES matrix with "distances" between nodes; they
+ * should be proportional to the memory access latency ratios.
+ */
+u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
+
+/* Node id of the memblk containing paddr; -1 if no memblk matches, 0 if none are registered. */
+int
+paddr_to_nid(unsigned long paddr)
+{
+	int	idx;
+
+	for (idx = 0; idx < num_node_memblks; idx++) {
+		const struct node_memblk_s *blk = &node_memblk[idx];
+		if (paddr >= blk->start_paddr && paddr < blk->start_paddr + blk->size)
+			return blk->nid;
+	}
+	return num_node_memblks ? -1 : 0;
+}
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
new file mode 100644
index 000000000000..464557e4ed82
--- /dev/null
+++ b/arch/ia64/mm/tlb.c
@@ -0,0 +1,190 @@
+/*
+ * TLB support routines.
+ *
+ * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
+ *		Modified RID allocation for SMP
+ *          Goutham Rao <goutham.rao@intel.com>
+ *              IPI based ptc implementation and A-step IPI implementation.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/delay.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/pal.h>
+#include <asm/tlbflush.h>
+
+static struct {
+	unsigned long mask;	/* mask of supported purge page-sizes: bit n set => 2^n bytes supported */
+	unsigned long max_bits;	/* log2() of largest supported purge page-size */
+} purge;	/* filled in by ia64_tlb_init(), consulted by flush_tlb_range() */
+
+struct ia64_ctx ia64_ctx = {
+	.lock =		SPIN_LOCK_UNLOCKED,
+	.next =		1,
+	.limit =	(1 << 15) - 1,		/* start out with the safe (architected) limit */
+	.max_ctx =	~0U
+};
+
+DEFINE_PER_CPU(u8, ia64_need_tlb_flush);	/* set to 1 for the other online CPUs by wrap_mmu_context() */
+
+/*
+ * Recompute [ia64_ctx.next, ia64_ctx.limit) so that it contains no context number currently
+ * held by a process, then flush the local TLB and flag the other online CPUs for a flush.
+ * Acquire the ia64_ctx.lock before calling this function!  The `mm' argument is not used.
+ */
+void
+wrap_mmu_context (struct mm_struct *mm)
+{
+	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
+	struct task_struct *tsk;
+	int i;
+
+	if (ia64_ctx.next > max_ctx)
+		ia64_ctx.next = 300;	/* skip daemons */
+	ia64_ctx.limit = max_ctx + 1;
+
+	/* Scan all the task's mm->context and set proper safe range */
+
+	read_lock(&tasklist_lock);
+  repeat:
+	for_each_process(tsk) {
+		if (!tsk->mm)
+			continue;
+		tsk_context = tsk->mm->context;
+		if (tsk_context == ia64_ctx.next) {
+			if (++ia64_ctx.next >= ia64_ctx.limit) {	/* candidate in use: try the next number */
+				/* empty range: reset the range limit and start over */
+				if (ia64_ctx.next > max_ctx)
+					ia64_ctx.next = 300;
+				ia64_ctx.limit = max_ctx + 1;
+				goto repeat;
+			}
+		}
+		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
+			ia64_ctx.limit = tsk_context;	/* lowest in-use context above next bounds the range */
+	}
+	read_unlock(&tasklist_lock);
+	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
+	{
+		int cpu = get_cpu(); /* prevent preemption/migration */
+		for (i = 0; i < NR_CPUS; ++i)
+			if (cpu_online(i) && (i != cpu))
+				per_cpu(ia64_need_tlb_flush, i) = 1;
+		put_cpu();
+	}
+	local_flush_tlb_all();
+}
+
+/*
+ * Purge [start, end) with ia64_ptcga() in steps of 2^nbits bytes.  At least one purge is
+ * always issued, even if start >= end on entry.
+ */
+void
+ia64_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbits)
+{
+	static DEFINE_SPINLOCK(ptcg_lock);
+	const unsigned long step = 1UL << nbits;
+
+	/* HW requires global serialization of ptc.ga.  */
+	spin_lock(&ptcg_lock);
+	do {
+		ia64_ptcga(start, (nbits<<2));	/* Flush ALAT entries also. */
+		ia64_srlz_i();
+		start += step;
+	} while (start < end);
+	spin_unlock(&ptcg_lock);
+}
+
+/*
+ * Flush the TLB of the executing CPU by issuing ia64_ptce() over the grid recorded in
+ * local_cpu_data: ptce_count[0] x ptce_count[1] purges, starting at ptce_base.
+ */
+void
+local_flush_tlb_all (void)
+{
+	unsigned long outer, inner, flags;
+	unsigned long addr = local_cpu_data->ptce_base;
+	const unsigned long n_outer = local_cpu_data->ptce_count[0];
+	const unsigned long n_inner = local_cpu_data->ptce_count[1];
+	const unsigned long outer_stride = local_cpu_data->ptce_stride[0];
+	const unsigned long inner_stride = local_cpu_data->ptce_stride[1];
+
+	local_irq_save(flags);
+	/* addr advances by stride[1] after each purge and by stride[0] after each inner pass */
+	for (outer = 0; outer < n_outer; ++outer, addr += outer_stride) {
+		for (inner = 0; inner < n_inner; ++inner, addr += inner_stride)
+			ia64_ptce(addr);
+	}
+	local_irq_restore(flags);
+	ia64_srlz_i();			/* srlz.i implies srlz.d */
+}
+
+/* Purge the TLB entries for [start, end) of vma's mm, using a purge page-size from purge.mask. */
+void
+flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long nbits, step;
+
+	if (mm != current->active_mm) {
+		/* this does happen, but perhaps it's not worth optimizing for? */
+#ifdef CONFIG_SMP
+		flush_tlb_all();
+#else
+		mm->context = 0;
+#endif
+		return;
+	}
+
+	nbits = ia64_fls(end - start + 0xfff);
+	while (unlikely(!(purge.mask & (1UL << nbits))) && nbits < purge.max_bits)
+		nbits++;	/* step up to the next supported purge page-size */
+	if (nbits > purge.max_bits)
+		nbits = purge.max_bits;
+	step = 1UL << nbits;
+	start &= ~(step - 1);	/* align start down to the purge page-size */
+
+#ifdef CONFIG_SMP
+	platform_global_tlb_purge(start, end, nbits);
+#else
+	do {
+		ia64_ptcl(start, (nbits<<2));
+		start += step;
+	} while (start < end);
+#endif
+	ia64_srlz_i();			/* srlz.i implies srlz.d */
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+/* Set up purge.{mask,max_bits} and this CPU's ptc.e loop parameters, then flush the local TLB. */
+void __devinit
+ia64_tlb_init (void)
+{
+	ia64_ptce_info_t ptce_info;
+	unsigned long tr_pgbits;
+	long status;
+
+	if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
+		printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; "
+		       "defaulting to architected purge page-sizes.\n", status);
+		purge.mask = 0x115557000UL;	/* bits 12-14,16,18,20,22,24,26,28,32 (4KB ... 4GB) */
+	}
+	purge.max_bits = ia64_fls(purge.mask);
+
+	ia64_get_ptce(&ptce_info);	/* values consumed by local_flush_tlb_all() */
+	local_cpu_data->ptce_base = ptce_info.base;
+	local_cpu_data->ptce_count[0] = ptce_info.count[0];
+	local_cpu_data->ptce_count[1] = ptce_info.count[1];
+	local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
+	local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
+	local_flush_tlb_all();		/* nuke left overs from bootstrapping... */
+}
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/mm
download	linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2