From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: usermodehelper: Tidy up waiting

Rather than using a tri-state integer for the wait flag in
call_usermodehelper_exec, define a proper enum, and use that.  I've
preserved the integer values so that any callers I've missed should
still work OK.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: David Howells <dhowells@redhat.com>
---
 arch/i386/mach-voyager/voyager_thread.c | 2 +-
 arch/x86_64/kernel/mce.c                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 		NULL,
 	};
 
-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
 		       string, ret);
 	}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d15991794..f3fb8174559e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
 	if (events != atomic_read(&mce_logged) && trigger[0]) {
 		/* Small race window, but should be harmless.  */
 		atomic_set(&mce_logged, events);
-		call_usermodehelper(trigger, trigger_argv, NULL, -1);
+		call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 	}
 }
 
-- 
cgit v1.2.3


From 810bab448e563ffd1718d78e9a3756806b626acc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: use elfnote.h to generate vsyscall notes.

Use existing elfnote.h to generate vsyscall notes, rather than doing
it locally.  Changes elfnote.h a bit to suit, since this is the first
asm user, and it wasn't quite right.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.com>
---
 arch/i386/kernel/vsyscall-note.S | 23 ++++++-----------------
 include/linux/elfnote.h          | 22 +++++++++++++++-------
 2 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..52e0cbbac70c 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,12 @@
  * Here we can supply some information useful to userland.
  */
 
-#include <linux/uts.h>
 #include <linux/version.h>
+#include <linux/elfnote.h>
 
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)			      \
-	.section name, flags;						      \
-	.balign 4;							      \
-	.long 1f - 0f;		/* name length */			      \
-	.long 3f - 2f;		/* data length */			      \
-	.long type;		/* note type */				      \
-0:	.asciz vendor;		/* vendor name */			      \
-1:	.balign 4;							      \
-2:
-
-#define ASM_ELF_NOTE_END						      \
-3:	.balign 4;		/* pad out section */			      \
-	.previous
-
-	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work. Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
 	.long LINUX_VERSION_CODE
-	ASM_ELF_NOTE_END
+ELFNOTE_END
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 9a1e0674e56c..e831759b2fb5 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -38,17 +38,25 @@
  * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
  *      ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
  */
-#define ELFNOTE(name, type, desctype, descdata)	\
-.pushsection .note.name, "",@note	;	\
-  .align 4				;	\
+#define ELFNOTE_START(name, type, flags)	\
+.pushsection .note.name, flags,@note	;	\
+  .balign 4				;	\
   .long 2f - 1f		/* namesz */	;	\
-  .long 4f - 3f		/* descsz */	;	\
+  .long 4484f - 3f	/* descsz */	;	\
   .long type				;	\
 1:.asciz #name				;	\
-2:.align 4				;	\
-3:desctype descdata			;	\
-4:.align 4				;	\
+2:.balign 4				;	\
+3:
+
+#define ELFNOTE_END				\
+4484:.balign 4				;	\
 .popsection				;
+
+#define ELFNOTE(name, type, desc)		\
+	ELFNOTE_START(name, type, "")		\
+		desc			;	\
+	ELFNOTE_END
+
 #else	/* !__ASSEMBLER__ */
 #include <linux/elf.h>
 /*
-- 
cgit v1.2.3


From fdb4c338c8d1d494e17c3422a3ea2129f6791596 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: paravirt: add an "mm" argument to alloc_pt

It's useful to know which mm is allocating a pagetable.  Xen uses this
to determine whether the pagetable being added to is pinned or not.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/kernel/vmi.c      | 2 +-
 arch/i386/mm/init.c         | 2 +-
 arch/i386/mm/pageattr.c     | 2 +-
 include/asm-i386/paravirt.h | 6 +++---
 include/asm-i386/pgalloc.h  | 6 +++---
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc5..234bd6ff518d 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
 {
 	vmi_set_page_type(pfn, VMI_PAGE_L1);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d3663..f9b6a8878547 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9c..37992ffb1633 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
-	paravirt_alloc_pt(page_to_pfn(base));
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
                                           addr == address ? prot : ref_prot));
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 7f846a7d6bcc..99bf661a65f2 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -173,7 +173,7 @@ struct paravirt_ops
 				 unsigned long va);
 
 	/* Hooks for allocating/releasing pagetable pages */
-	void (*alloc_pt)(u32 pfn);
+	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
 	void (*alloc_pd)(u32 pfn);
 	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
 	void (*release_pt)(u32 pfn);
@@ -725,9 +725,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
 }
 
-static inline void paravirt_alloc_pt(unsigned pfn)
+static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
 {
-	PVOP_VCALL1(alloc_pt, pfn);
+	PVOP_VCALL2(alloc_pt, mm, pfn);
 }
 static inline void paravirt_release_pt(unsigned pfn)
 {
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index d07b7afc2692..f2fc33ceb9f2 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -7,7 +7,7 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
@@ -17,13 +17,13 @@
 
 #define pmd_populate_kernel(mm, pmd, pte)			\
 do {								\
-	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT);		\
+	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);		\
 	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));		\
 } while (0)
 
 #define pmd_populate(mm, pmd, pte) 				\
 do {								\
-	paravirt_alloc_pt(page_to_pfn(pte));			\
+	paravirt_alloc_pt(mm, page_to_pfn(pte));		\
 	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
 		((unsigned long long)page_to_pfn(pte) <<	\
 			(unsigned long long) PAGE_SHIFT)));	\
-- 
cgit v1.2.3


From 6996d3b63fd9a64341bc80dad1b556fd3eb81272 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: paravirt: add a hook for once the allocator is ready

Add a hook so that the paravirt backend knows when the allocator is
ready.  This is useful for the obvious reason that the allocator is
available, but the other side-effect of having the bootmem allocator
available is that each page now has an associated "struct page".

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/kernel/setup.c    | 2 ++
 include/asm-i386/paravirt.h | 8 ++++++++
 include/asm-i386/setup.h    | 4 ++++
 3 files changed, 14 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 2d61e65eeb50..74871d066c2b 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
+	paravirt_post_allocator_init();
+
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 99bf661a65f2..786856950b1a 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -52,6 +52,8 @@ struct paravirt_ops
 	/* Basic arch-specific setup */
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
+	void (*post_allocator_init)(void);
+
 	void (*init_IRQ)(void);
 	void (*time_init)(void);
 
@@ -669,6 +671,12 @@ static inline void setup_secondary_clock(void)
 }
 #endif
 
+static inline void paravirt_post_allocator_init(void)
+{
+	if (paravirt_ops.post_allocator_init)
+		(*paravirt_ops.post_allocator_init)();
+}
+
 static inline void paravirt_pagetable_setup_start(pgd_t *base)
 {
 	if (paravirt_ops.pagetable_setup_start)
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 0d5bff9dc4a5..7862fe858a9e 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start,
 
 extern unsigned long init_pg_tables_end;
 
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init()	do {} while (0)
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif  /*  __KERNEL__  */
-- 
cgit v1.2.3


From 53787013248f52af81d99f63454e5a5cf34d6f12 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: paravirt: unstatic leave_mm

Make globally leave_mm visible, specifically so that Xen can use it to
shoot-down lazy uses of cr3.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/smp.c         | 5 +++--
 include/asm-i386/mmu_context.h | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e2..2d35d8502029 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
 #include <mach_apic.h>
 
 /*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
 
 /*
- * We cannot call mmdrop() because we are in interrupt context, 
+ * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  *
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
 {
 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
 		BUG();
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 8198d1cca1f3..7eb0b0b1fb3c 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
+void leave_mm(unsigned long cpu);
+
 static inline void switch_mm(struct mm_struct *prev,
 			     struct mm_struct *next,
 			     struct task_struct *tsk)
-- 
cgit v1.2.3


From 724faa89ccd8fae65f3d41a47b0e1034cf07918b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: paravirt: unstatic smp_store_cpu_info

Paravirt implementations need to store cpu info when bringing up cpus.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/kernel/smpboot.c | 2 +-
 include/asm-i386/smp.h     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8e..26752931023a 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
  * a given CPU
  */
 
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = cpu_data + id;
 
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 0c7132787062..0f54f44b4720 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -129,6 +129,8 @@ extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern unsigned int num_processors;
 
+void __cpuinit smp_store_cpu_info(int id);
+
 #endif /* !__ASSEMBLY__ */
 
 #else /* CONFIG_SMP */
-- 
cgit v1.2.3


From c70df74376c1e29a04e07e23dd3f4c384d6166dd Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: paravirt: make siblingmap functions visible

Paravirt implementations need to set the sibling map on new cpus.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/kernel/smpboot.c | 6 ++----
 include/asm-i386/smp.h     | 3 +++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 26752931023a..5910d3fac561 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
-static inline void
-set_cpu_sibling_map(int cpu)
+void set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
 {
 	int sibling;
 	struct cpuinfo_x86 *c = cpu_data;
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 0f54f44b4720..1f73bde165b1 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[];
 
 #define cpu_physical_id(cpu)	x86_cpu_to_apicid[cpu]
 
+extern void set_cpu_sibling_map(int cpu);
+
 #ifdef CONFIG_HOTPLUG_CPU
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
+extern void remove_siblinginfo(int cpu);
 #endif
 
 struct smp_ops
-- 
cgit v1.2.3


From bdef40a6af64a0140a65df49bf504124d57094a9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: paravirt: export __supported_pte_mask

__supported_pte_mask is needed when constructing pte values.  Xen
device drivers need to do this to make mappings of foreign pages (ie,
pages granted to us by other domains).

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/mm/init.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index f9b6a8878547..6a68b1ae061c 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /*
  * noexec = on|off
-- 
cgit v1.2.3


From d572929cdd12a60732c3522f7cf011bfa29165cf Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: paravirt: helper to disable all IO space

In a virtual environment, device drivers such as legacy IDE will waste
quite a lot of time probing for their devices which will never appear.
This helper function allows a paravirt implementation to lay claim to
the whole iomem and ioport space, thereby disabling all device drivers
trying to claim IO resources.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/kernel/paravirt.c | 35 +++++++++++++++++++++++++++++++++++
 include/asm-i386/paravirt.h |  1 +
 2 files changed, 36 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5e..60e08b9b50a4 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
+static struct resource reserve_ioports = {
+	.start = 0,
+	.end = IO_SPACE_LIMIT,
+	.name = "paravirt-ioport",
+	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+	.start = 0,
+	.end = -1,
+	.name = "paravirt-iomem",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware.  This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+	int ret;
+
+	ret = request_resource(&ioport_resource, &reserve_ioports);
+	if (ret == 0) {
+		ret = request_resource(&iomem_resource, &reserve_iomem);
+		if (ret)
+			release_resource(&reserve_ioports);
+	}
+
+	return ret;
+}
+
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 786856950b1a..690ada22437a 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -262,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
 unsigned paravirt_patch_insns(void *site, unsigned len,
 			      const char *start, const char *end);
 
+int paravirt_disable_iospace(void);
 
 /*
  * This generates an indirect call based on the operation type number.
-- 
cgit v1.2.3


From 688340ea34c61ad12473ccd837325b59aada9a93 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: Add a sched_clock paravirt_op

The tsc-based get_scheduled_cycles interface is not a good match for
Xen's runstate accounting, which reports everything in nanoseconds.

This patch replaces this interface with a sched_clock interface, which
matches both Xen and VMI's requirements.

In order to do this, we:
   1. replace get_scheduled_cycles with sched_clock
   2. hoist cycles_2_ns into a common header
   3. update vmi accordingly

One thing to note: because sched_clock is implemented as a weak
function in kernel/sched.c, we must define a real function in order to
override this weak binding.  This means the usual paravirt_ops
technique of using an inline function won't work in this case.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Dan Hecht <dhecht@vmware.com>
Cc: john stultz <johnstul@us.ibm.com>
---
 arch/i386/kernel/paravirt.c |  2 +-
 arch/i386/kernel/tsc.c      | 23 +++++++++++++++--------
 arch/i386/kernel/vmi.c      |  2 +-
 arch/i386/kernel/vmiclock.c |  6 +++---
 include/asm-i386/paravirt.h |  7 +++++--
 include/asm-i386/timer.h    | 32 +++++++++++++++++++++++++++++++-
 include/asm-i386/vmi_time.h |  2 +-
 7 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 60e08b9b50a4..53f07a8275e3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -302,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.get_scheduled_cycles = native_read_tsc,
+	.sched_clock = native_sched_clock,
 	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e8..252f9010f283 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
  *
  *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
 	unsigned long long this_offset;
 
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
-	get_scheduled_cycles(this_offset);
+	rdtscll(this_offset);
 
 	/* return the value in ns */
 	return cycles_2_ns(this_offset);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+	__attribute__((alias("native_sched_clock")));
+#endif
+
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 234bd6ff518d..72042bb7ec94 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
 		paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
 		paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
-		paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+		paravirt_ops.sched_clock = vmi_sched_clock;
  		paravirt_ops.get_cpu_khz = vmi_cpu_khz;
 
 		/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a8762..f9b845f4e692 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
 	return 0;
 }
 
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
 /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 690ada22437a..7df88be2dd9e 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -118,7 +118,7 @@ struct paravirt_ops
 
 	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(void);
- 	u64 (*get_scheduled_cycles)(void);
+	unsigned long long (*sched_clock)(void);
 	unsigned long (*get_cpu_khz)(void);
 
 	/* Segment descriptor handling */
@@ -566,7 +566,10 @@ static inline u64 paravirt_read_tsc(void)
 
 #define rdtscll(val) (val = paravirt_read_tsc())
 
-#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
+static inline unsigned long long paravirt_sched_clock(void)
+{
+	return PVOP_CALL0(unsigned long long, sched_clock);
+}
 #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
 
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index 153770e25faa..51a713e33a9e 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -15,8 +15,38 @@ extern int no_sync_cmos_clock;
 extern int recalibrate_cpu_khz(void);
 
 #ifndef CONFIG_PARAVIRT
-#define get_scheduled_cycles(val) rdtscll(val)
 #define calculate_cpu_khz() native_calculate_cpu_khz()
 #endif
 
+/* Accellerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *		ns = cycles * (10^6 / cpu_khz)
+ *
+ *	Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ *	And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better percision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+extern unsigned long cyc2ns_scale __read_mostly;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+
 #endif
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h
index 213930b995cb..478188130328 100644
--- a/include/asm-i386/vmi_time.h
+++ b/include/asm-i386/vmi_time.h
@@ -49,7 +49,7 @@ extern struct vmi_timer_ops {
 extern void __init vmi_time_init(void);
 extern unsigned long vmi_get_wallclock(void);
 extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_get_sched_cycles(void);
+extern unsigned long long vmi_sched_clock(void);
 extern unsigned long vmi_cpu_khz(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-- 
cgit v1.2.3


From 24037a8b69dbf15bfed8fd42a2a2e442d7b0395b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: Add nosegneg capability to the vsyscall page notes

Add the "nosegneg" fake capabilty to the vsyscall page notes. This is
used by the runtime linker to select a glibc version which then
disables negative-offset accesses to the thread-local segment via
%gs. These accesses require emulation in Xen (because segments are
truncated to protect the hypervisor address space) and avoiding them
provides a measurable performance boost.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Zachary Amsden <zach@vmware.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ulrich Drepper <drepper@redhat.com>
---
 arch/i386/kernel/vsyscall-note.S | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index 52e0cbbac70c..271f16a8ca01 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -12,3 +12,31 @@
 ELFNOTE_START(Linux, 0, "a")
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
+
+#ifdef CONFIG_XEN
+
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *	hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ */
+
+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
+   used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT	1
+
+ELFNOTE_START(GNU, 2, "a")
+	.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT		/* ncaps, mask */
+	.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg"	/* bit, name */
+ELFNOTE_END
+#endif
-- 
cgit v1.2.3


From 5ead97c84fa7d63a6a7a2f4e9f18f452bd109045 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: xen: Core Xen implementation

This patch is a rollup of all the core pieces of the Xen
implementation, including:
 - booting and setup
 - pagetable setup
 - privileged instructions
 - segmentation
 - interrupt flags
 - upcalls
 - multicall batching

BOOTING AND SETUP

The vmlinux image is decorated with ELF notes which tell the Xen
domain builder what the kernel's requirements are; the domain builder
then constructs the address space accordingly and starts the kernel.

Xen has its own entrypoint for the kernel (contained in an ELF note).
The ELF notes are set up by xen-head.S, which is included into head.S.
In principle it could be linked separately, but it seems to provoke
lots of binutils bugs.

Because the domain builder starts the kernel in a fairly sane state
(32-bit protected mode, paging enabled, flat segments set up), there's
not a lot of setup needed before starting the kernel proper.  The main
steps are:
  1. Install the Xen paravirt_ops, which is simply a matter of a
     structure assignment.
  2. Set init_mm to use the Xen-supplied pagetables (analogous to the
     head.S generated pagetables in a native boot).
  3. Reserve address space for Xen, since it takes a chunk at the top
     of the address space for its own use.
  4. Call start_kernel()

PAGETABLE SETUP

Once we hit the main kernel boot sequence, it will end up calling back
via paravirt_ops to set up various pieces of Xen specific state.  One
of the critical things which requires a bit of extra care is the
construction of the initial init_mm pagetable.  Because Xen places
tight constraints on pagetables (an active pagetable must always be
valid, and must always be mapped read-only to the guest domain), we
need to be careful when constructing the new pagetable to keep these
constraints in mind.  It turns out that the easiest way to do this is
use the initial Xen-provided pagetable as a template, and then just
insert new mappings for memory where a mapping doesn't already exist.

This means that during pagetable setup, it uses a special version of
xen_set_pte which ignores any attempt to remap a read-only page as
read-write (since Xen will map its own initial pagetable as RO), but
lets other changes to the ptes happen, so that things like NX are set
properly.

PRIVILEGED INSTRUCTIONS AND SEGMENTATION

When the kernel runs under Xen, it runs in ring 1 rather than ring 0.
This means that it is more privileged than user-mode in ring 3, but it
still can't run privileged instructions directly.  Non-performance
critical instructions are dealt with by taking a privilege exception
and trapping into the hypervisor and emulating the instruction, but
more performance-critical instructions have their own specific
paravirt_ops.  In many cases we can avoid having to do any hypercalls
for these instructions, or the Xen implementation is quite different
from the normal native version.

The privileged instructions fall into the broad classes of:
  Segmentation: setting up the GDT and the GDT entries, LDT,
     TLS and so on.  Xen doesn't allow the GDT to be directly
     modified; all GDT updates are done via hypercalls where the new
     entries can be validated.  This is important because Xen uses
     segment limits to prevent the guest kernel from damaging the
     hypervisor itself.
  Traps and exceptions: Xen uses a special format for trap entrypoints,
     so when the kernel wants to set an IDT entry, it needs to be
     converted to the form Xen expects.  Xen sets int 0x80 up specially
     so that the trap goes straight from userspace into the guest kernel
     without going via the hypervisor.  sysenter isn't supported.
  Kernel stack: The esp0 entry is extracted from the tss and provided to
     Xen.
  TLB operations: the various TLB calls are mapped into corresponding
     Xen hypercalls.
  Control registers: all the control registers are privileged.  The most
     important is cr3, which points to the base of the current pagetable,
     and we handle it specially.

Another instruction we treat specially is CPUID, even though its not
privileged.  We want to control what CPU features are visible to the
rest of the kernel, and so CPUID ends up going into a paravirt_op.
Xen implements this mainly to disable the ACPI and APIC subsystems.

INTERRUPT FLAGS

Xen maintains its own separate flag for masking events, which is
contained within the per-cpu vcpu_info structure.  Because the guest
kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely
ignored (and must be, because even if a guest domain disables
interrupts for itself, it can't disable them overall).

(A note on terminology: "events" and interrupts are effectively
synonymous.  However, rather than using an "enable flag", Xen uses a
"mask flag", which blocks event delivery when it is non-zero.)

There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which
are implemented to manage the Xen event mask state.  The only thing
worth noting is that when events are unmasked, we need to explicitly
see if there's a pending event and call into the hypervisor to make
sure it gets delivered.

UPCALLS

Xen needs a couple of upcall (or callback) functions to be implemented
by each guest.  One is the event upcalls, which is how events
(interrupts, effectively) are delivered to the guests.  The other is
the failsafe callback, which is used to report errors in either
reloading a segment register, or caused by iret.  These are
implemented in i386/kernel/entry.S so they can jump into the normal
iret_exc path when necessary.

MULTICALL BATCHING

Xen provides a multicall mechanism, which allows multiple hypercalls
to be issued at once in order to mitigate the cost of trapping into
the hypervisor.  This is particularly useful for context switches,
since the 4-5 hypercalls they would normally need (reload cr3, update
TLS, maybe update LDT) can be reduced to one.  This patch implements a
generic batching mechanism for hypercalls, which gets used in many
places in the Xen code.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Ian Pratt <ian.pratt@xensource.com>
Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Cc: Adrian Bunk <bunk@stusta.de>
---
 arch/i386/Makefile               |   3 +
 arch/i386/kernel/entry.S         |  71 ++++
 arch/i386/kernel/head.S          |   5 +-
 arch/i386/kernel/vmlinux.lds.S   |   1 +
 arch/i386/xen/Makefile           |   1 +
 arch/i386/xen/enlighten.c        | 745 +++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/features.c         |  29 ++
 arch/i386/xen/multicalls.c       |  89 +++++
 arch/i386/xen/multicalls.h       |  45 +++
 arch/i386/xen/setup.c            |  97 +++++
 arch/i386/xen/xen-head.S         |  36 ++
 arch/i386/xen/xen-ops.h          |  31 ++
 include/asm-i386/irq.h           |   1 +
 include/asm-i386/xen/hypercall.h |  18 +
 include/xen/features.h           |  23 ++
 include/xen/page.h               | 179 ++++++++++
 16 files changed, 1373 insertions(+), 1 deletion(-)
 create mode 100644 arch/i386/xen/Makefile
 create mode 100644 arch/i386/xen/enlighten.c
 create mode 100644 arch/i386/xen/features.c
 create mode 100644 arch/i386/xen/multicalls.c
 create mode 100644 arch/i386/xen/multicalls.h
 create mode 100644 arch/i386/xen/setup.c
 create mode 100644 arch/i386/xen/xen-head.S
 create mode 100644 arch/i386/xen/xen-ops.h
 create mode 100644 include/xen/features.h
 create mode 100644 include/xen/page.h

(limited to 'arch')

diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 181cc29a7c4f..01f0ff0daaf4 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000)	:= -Iinclude/asm-i386/mach-es7000
 mcore-$(CONFIG_X86_ES7000)	:= mach-default
 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)		+= arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c9..ffb236544270 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1023,6 +1023,77 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	CFI_STARTPROC
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	mov %esp, %eax
+	call xen_evtchn_do_upcall
+	jmp  ret_from_intr
+	CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	testl %eax,%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	lea 16(%esp),%esp
+	CFI_ADJUST_CFA_OFFSET -16
+	jz 5f
+	addl $16,%esp
+	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
+5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,6b
+	.long 2b,7b
+	.long 3b,8b
+	.long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif	/* CONFIG_XEN */
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 82714668d43b..7c52b222207e 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 ENTRY(swapper_pg_pmd)
@@ -538,6 +539,8 @@ fault_msg:
 	.ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
 	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
 
+#include "../xen/xen-head.S"
+
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
  * only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c82..00f1bc47d3a2 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+	*(.data.page_aligned)
 	*(.data.idt)
   }
 
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..60bc1cfb101c
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1 @@
+obj-y		:= enlighten.o setup.o features.o multicalls.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..2d484f9320de
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,745 @@
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+
+#include "xen-ops.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static void xen_vcpu_setup(int cpu)
+{
+	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+}
+
+static void __init xen_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	unsigned maskedx = ~0;
+
+	/*
+	 * Mask out inconvenient features, to try and disable as many
+	 * unsupported kernel subsystems as possible.
+	 */
+	if (*eax == 1)
+		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
+			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
+			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+	asm(XEN_EMULATE_PREFIX "cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+	*edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+	HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+	return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+	struct vcpu_info *vcpu;
+	unsigned long flags;
+
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	/* flag has opposite sense of mask */
+	flags = !vcpu->evtchn_upcall_mask;
+	preempt_enable();
+
+	/* convert to IF type flag
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+	struct vcpu_info *vcpu;
+
+	preempt_disable();
+
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = flags;
+
+	if (flags == 0) {
+		/* Unmask then check (avoid races).  We're only protecting
+		   against updates by this CPU, so there's no need for
+		   anything stronger. */
+		barrier();
+
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			force_evtchn_callback();
+		preempt_enable();
+	} else
+		preempt_enable_no_resched();
+}
+
+static void xen_irq_disable(void)
+{
+	struct vcpu_info *vcpu;
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = 1;
+	preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+	struct vcpu_info *vcpu;
+
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = 0;
+
+	/* Unmask then check (avoid races).  We're only protecting
+	   against updates by this CPU, so there's no need for
+	   anything stronger. */
+	barrier();
+
+	if (unlikely(vcpu->evtchn_upcall_pending))
+		force_evtchn_callback();
+	preempt_enable();
+}
+
+static void xen_safe_halt(void)
+{
+	/* Blocking includes an implicit local_irq_enable(). */
+	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+		BUG();
+}
+
+static void xen_halt(void)
+{
+	if (irqs_disabled())
+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+	else
+		xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+	switch (mode) {
+	case PARAVIRT_LAZY_NONE:
+		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+		break;
+
+	case PARAVIRT_LAZY_MMU:
+	case PARAVIRT_LAZY_CPU:
+		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+		break;
+
+	case PARAVIRT_LAZY_FLUSH:
+		/* flush if necessary, but don't change state */
+		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+			xen_mc_flush();
+		return;
+	}
+
+	xen_mc_flush();
+	x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+	return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+	unsigned long linear_addr = (unsigned long)addr;
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_SET_LDT;
+	if (linear_addr) {
+		/* ldt my be vmalloced, use arbitrary_virt_to_machine */
+		xmaddr_t maddr;
+		maddr = arbitrary_virt_to_machine((unsigned long)addr);
+		linear_addr = (unsigned long)maddr.maddr;
+	}
+	op->arg1.linear_addr = linear_addr;
+	op->arg2.nr_ents = entries;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+	unsigned long *frames;
+	unsigned long va = dtr->address;
+	unsigned int size = dtr->size + 1;
+	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+	int f;
+	struct multicall_space mcs;
+
+	/* A GDT can be up to 64k in size, which corresponds to 8192
+	   8-byte entries, or 16 4k pages.. */
+
+	BUG_ON(size > 65536);
+	BUG_ON(va & ~PAGE_MASK);
+
+	mcs = xen_mc_entry(sizeof(*frames) * pages);
+	frames = mcs.args;
+
+	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+		frames[f] = virt_to_mfn(va);
+		make_lowmem_page_readonly((void *)va);
+	}
+
+	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+				unsigned int cpu, unsigned int i)
+{
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	struct multicall_space mc = __xen_mc_entry(0);
+
+	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	xen_mc_batch();
+
+	load_TLS_descriptor(t, cpu, 0);
+	load_TLS_descriptor(t, cpu, 1);
+	load_TLS_descriptor(t, cpu, 2);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+				u32 low, u32 high)
+{
+	unsigned long lp = (unsigned long)&dt[entrynum];
+	xmaddr_t mach_lp = virt_to_machine(lp);
+	u64 entry = (u64)high << 32 | low;
+
+	xen_mc_flush();
+	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+		BUG();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+			    struct trap_info *info)
+{
+	u8 type, dpl;
+
+	type = (high >> 8) & 0x1f;
+	dpl = (high >> 13) & 3;
+
+	if (type != 0xf && type != 0xe)
+		return 0;
+
+	info->vector = vector;
+	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+	info->cs = low >> 16;
+	info->flags = dpl;
+	/* interrupt gates clear IF */
+	if (type == 0xe)
+		info->flags |= 4;
+
+	return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry.  If the entry is part of the current IDT, then
+   also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+				u32 low, u32 high)
+{
+
+	int cpu = smp_processor_id();
+	unsigned long p = (unsigned long)&dt[entrynum];
+	unsigned long start = per_cpu(idt_desc, cpu).address;
+	unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+
+	xen_mc_flush();
+
+	write_dt_entry(dt, entrynum, low, high);
+
+	if (p >= start && (p + 8) <= end) {
+		struct trap_info info[2];
+
+		info[1].address = 0;
+
+		if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+			if (HYPERVISOR_set_trap_table(info))
+				BUG();
+	}
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+	static DEFINE_SPINLOCK(lock);
+	static struct trap_info traps[257];
+
+	int cpu = smp_processor_id();
+	unsigned in, out, count;
+
+	per_cpu(idt_desc, cpu) = *desc;
+
+	count = (desc->size+1) / 8;
+	BUG_ON(count > 256);
+
+	spin_lock(&lock);
+	for (in = out = 0; in < count; in++) {
+		const u32 *entry = (u32 *)(desc->address + in * 8);
+
+		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+			out++;
+	}
+	traps[out].address = 0;
+
+	xen_mc_flush();
+	if (HYPERVISOR_set_trap_table(traps))
+		BUG();
+
+	spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
+   they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+				u32 low, u32 high)
+{
+	switch ((high >> 8) & 0xff) {
+	case DESCTYPE_LDT:
+	case DESCTYPE_TSS:
+		/* ignore */
+		break;
+
+	default: {
+		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		u64 desc = (u64)high << 32 | low;
+
+		xen_mc_flush();
+		if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+			BUG();
+	}
+
+	}
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+				   struct thread_struct *thread)
+{
+	struct multicall_space mcs = xen_mc_entry(0);
+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+	struct mmuext_op op;
+
+	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+	struct mmuext_op op;
+
+	op.cmd = MMUEXT_INVLPG_LOCAL;
+	op.arg1.linear_addr = addr & PAGE_MASK;
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
+static unsigned long xen_read_cr2(void)
+{
+	return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+	/* never allow TSC to be disabled */
+	native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+static unsigned long xen_read_cr3(void)
+{
+	return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	if (cr3 == x86_read_percpu(xen_cr3)) {
+		/* just a simple tlb flush */
+		xen_flush_tlb();
+		return;
+	}
+
+	x86_write_percpu(xen_cr3, cr3);
+
+
+	{
+		struct mmuext_op *op;
+		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+		op = mcs.args;
+		op->cmd = MMUEXT_NEW_BASEPTR;
+		op->arg1.mfn = mfn;
+
+		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+		xen_mc_issue(PARAVIRT_LAZY_CPU);
+	}
+}
+
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+	/* XXX pfn isn't necessarily a lowmem page */
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+static void xen_alloc_pd(u32 pfn)
+{
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+static void xen_release_pd(u32 pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void xen_release_pt(u32 pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn,
+					u32 start, u32 count)
+{
+	xen_alloc_pd(pfn);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+	init_mm.pgd = base;
+	/*
+	 * copy top-level of Xen-supplied pagetable into place.	 For
+	 * !PAE we can use this as-is, but for PAE it is a stand-in
+	 * while we copy the pmd pages.
+	 */
+	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+	if (PTRS_PER_PMD > 1) {
+		int i;
+		/*
+		 * For PAE, need to allocate new pmds, rather than
+		 * share Xen's, since Xen doesn't like pmd's being
+		 * shared between address spaces.
+		 */
+		for (i = 0; i < PTRS_PER_PGD; i++) {
+			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+				       PAGE_SIZE);
+
+				xen_alloc_pd(PFN_DOWN(__pa(pmd)));
+
+				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+			} else
+				pgd_clear(&base[i]);
+		}
+	}
+
+	/* make sure zero_page is mapped RO so we can use it in pagetables */
+	make_lowmem_page_readonly(empty_zero_page);
+	make_lowmem_page_readonly(base);
+	/*
+	 * Switch to new pagetable.  This is done before
+	 * pagetable_init has done anything so that the new pages
+	 * added to the table can be prepared properly for Xen.
+	 */
+	xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		/*
+		 * Create a mapping for the shared info page.
+		 * Should be set_fixmap(), but shared_info is a machine
+		 * address with no corresponding pseudo-phys address.
+		 */
+#if 0
+		set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+			    PFN_DOWN(xen_start_info->shared_info),
+			    PAGE_KERNEL);
+#endif
+
+		HYPERVISOR_shared_info =
+			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+	} else
+		HYPERVISOR_shared_info =
+			(struct shared_info *)__va(xen_start_info->shared_info);
+
+#if 0
+	xen_pgd_pin(base);
+#endif
+
+	xen_vcpu_setup(smp_processor_id());
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+	.paravirt_enabled = 1,
+	.shared_kernel_pmd = 0,
+
+	.name = "Xen",
+	.banner = xen_banner,
+
+	.patch = paravirt_patch_default,
+
+	.memory_setup = xen_memory_setup,
+	.arch_setup = xen_arch_setup,
+
+	.cpuid = xen_cpuid,
+
+	.set_debugreg = xen_set_debugreg,
+	.get_debugreg = xen_get_debugreg,
+
+	.clts = native_clts,
+
+	.read_cr0 = native_read_cr0,
+	.write_cr0 = native_write_cr0,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = native_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,
+
+	.read_cr4 = native_read_cr4,
+	.read_cr4_safe = native_read_cr4_safe,
+	.write_cr4 = xen_write_cr4,
+
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+	.wbinvd = native_wbinvd,
+
+	.read_msr = native_read_msr_safe,
+	.write_msr = native_write_msr_safe,
+	.read_tsc = native_read_tsc,
+	.read_pmc = native_read_pmc,
+
+	.iret = (void *)&hypercall_page[__HYPERVISOR_iret],
+	.irq_enable_sysexit = NULL,  /* never called */
+
+	.load_tr_desc = paravirt_nop,
+	.set_ldt = xen_set_ldt,
+	.load_gdt = xen_load_gdt,
+	.load_idt = xen_load_idt,
+	.load_tls = xen_load_tls,
+
+	.store_gdt = native_store_gdt,
+	.store_idt = native_store_idt,
+	.store_tr = xen_store_tr,
+
+	.write_ldt_entry = xen_write_ldt_entry,
+	.write_gdt_entry = xen_write_gdt_entry,
+	.write_idt_entry = xen_write_idt_entry,
+	.load_esp0 = xen_load_esp0,
+
+	.set_iopl_mask = xen_set_iopl_mask,
+	.io_delay = xen_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	.apic_write = paravirt_nop,
+	.apic_write_atomic = paravirt_nop,
+	.apic_read = xen_apic_read,
+	.setup_boot_clock = paravirt_nop,
+	.setup_secondary_clock = paravirt_nop,
+	.startup_ipi_hook = paravirt_nop,
+#endif
+
+	.flush_tlb_user = xen_flush_tlb,
+	.flush_tlb_kernel = xen_flush_tlb,
+	.flush_tlb_single = xen_flush_tlb_single,
+
+	.pte_update = paravirt_nop,
+	.pte_update_defer = paravirt_nop,
+
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+
+	.alloc_pt = xen_alloc_pt,
+	.alloc_pd = xen_alloc_pd,
+	.alloc_pd_clone = xen_alloc_pd_clone,
+	.release_pd = xen_release_pd,
+	.release_pt = xen_release_pt,
+
+	.set_lazy_mode = xen_set_lazy_mode,
+};
+
+/* First C function to be called on Xen boot */
+asmlinkage void __init xen_start_kernel(void)
+{
+	pgd_t *pgd;
+
+	if (!xen_start_info)
+		return;
+
+	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+	/* Install Xen paravirt ops */
+	paravirt_ops = xen_paravirt_ops;
+
+	xen_setup_features();
+
+	/* Get mfn list */
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+	pgd = (pgd_t *)xen_start_info->pt_base;
+
+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+	init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+	/* keep using Xen gdt for now; no urgent need to change it */
+
+	x86_write_percpu(xen_cr3, __pa(pgd));
+	xen_vcpu_setup(0);
+
+	paravirt_ops.kernel_rpl = 1;
+	if (xen_feature(XENFEAT_supervisor_mode_kernel))
+		paravirt_ops.kernel_rpl = 0;
+
+	/* set the limit of our address space */
+	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+	/* set up basic CPUID stuff */
+	cpu_detect(&new_cpu_data);
+	new_cpu_data.hard_math = 1;
+	new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+	/* Poke various useful things into boot_params */
+	LOADER_TYPE = (9 << 4) | 0;
+	INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+	INITRD_SIZE = xen_start_info->mod_len;
+
+	/* Start the world */
+	start_kernel();
+}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+	struct xen_feature_info fi;
+	int i, j;
+
+	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+		fi.submap_idx = i;
+		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+			break;
+		for (j = 0; j < 32; j++)
+			xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+	}
+}
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 000000000000..869f9833f08f
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,89 @@
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface.  This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls.  There's a
+ * per-cpu buffer of outstanding multicalls.  When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with storage for space which is
+ * pointed to by the arguments (for passing pointers to structures,
+ * etc).  When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested.  There's no way to get per-multicall
+ * return results back.  It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/percpu.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH	8
+#define MC_ARGS		(MC_BATCH * 32 / sizeof(u64))
+
+struct mc_buffer {
+	struct multicall_entry entries[MC_BATCH];
+	u64 args[MC_ARGS];
+	unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	int ret = 0;
+	unsigned long flags;
+
+	/* Disable interrupts in case someone comes in and queues
+	   something in the middle */
+	local_irq_save(flags);
+
+	if (b->mcidx) {
+		int i;
+
+		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+			BUG();
+		for (i = 0; i < b->mcidx; i++)
+			if (b->entries[i].result < 0)
+				ret++;
+		b->mcidx = 0;
+		b->argidx = 0;
+	} else
+		BUG_ON(b->argidx != 0);
+
+	put_cpu_var(mc_buffer);
+	local_irq_restore(flags);
+
+	BUG_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct multicall_space ret;
+	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+	BUG_ON(argspace > MC_ARGS);
+
+	if (b->mcidx == MC_BATCH ||
+	    (b->argidx + argspace) > MC_ARGS)
+		xen_mc_flush();
+
+	ret.mc = &b->entries[b->mcidx];
+	b->mcidx++;
+	ret.args = &b->args[b->argidx];
+	b->argidx += argspace;
+
+	put_cpu_var(mc_buffer);
+
+	return ret;
+}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+	struct multicall_entry *mc;
+	void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
+   paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+	/* need to disable interrupts until this entry is complete */
+	local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+	xen_mc_batch();
+	return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+	if ((xen_get_lazy_mode() & mode) == 0)
+		xen_mc_flush();
+
+	/* restore flags saved in xen_mc_batch */
+	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+}
+
+#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 000000000000..7da93ee612f6
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,97 @@
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+
+#include <asm/elf.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+#include "xen-ops.h"
+
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+static __initdata struct shared_info init_shared;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = &init_shared;
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+	unsigned long max_pfn = xen_start_info->nr_pages;
+
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+	return "Xen";
+}
+
+static void xen_idle(void)
+{
+	local_irq_disable();
+
+	if (need_resched())
+		local_irq_enable();
+	else {
+		current_thread_info()->status &= ~TS_POLLING;
+		smp_mb__after_clear_bit();
+		safe_halt();
+		current_thread_info()->status |= TS_POLLING;
+	}
+}
+
+void __init xen_arch_setup(void)
+{
+	struct physdev_set_iopl set_iopl;
+	int rc;
+
+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+	set_iopl.iopl = 1;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+	if (rc != 0)
+		printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+		disable_acpi();
+	}
+#endif
+
+	memcpy(boot_command_line, xen_start_info->cmd_line,
+	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+	pm_idle = xen_idle;
+}
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 000000000000..2998d55a0017
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+	place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+ENTRY(startup_xen)
+	movl %esi,xen_start_info
+	cld
+	movl $(init_thread_union+THREAD_SIZE),%esp
+	jmp xen_start_kernel
+
+.pushsection ".bss.page_aligned"
+	.align PAGE_SIZE_asm
+ENTRY(hypercall_page)
+	.skip 0x1000
+.popsection
+
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
+	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+
+#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 000000000000..79648fe1ab77
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,31 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+#include <linux/clocksource.h>
+
+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+unsigned long xen_cpu_khz(void);
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+cycle_t xen_clocksource_read(void);
+
+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+static inline unsigned xen_get_lazy_mode(void)
+{
+	return x86_read_percpu(xen_lazy_mode);
+}
+
+
+#endif /* XEN_OPS_H */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 9e15ce0006eb..36f310632c49 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+unsigned int do_IRQ(struct pt_regs *regs);
 void init_IRQ(void);
 void __init native_init_IRQ(void);
 
diff --git a/include/asm-i386/xen/hypercall.h b/include/asm-i386/xen/hypercall.h
index 53912859708b..bc0ee7d961ca 100644
--- a/include/asm-i386/xen/hypercall.h
+++ b/include/asm-i386/xen/hypercall.h
@@ -392,4 +392,22 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
 	mcl->args[2] = (unsigned long)success_count;
 	mcl->args[3] = domid;
 }
+
+static inline void
+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+{
+	mcl->op = __HYPERVISOR_set_gdt;
+	mcl->args[0] = (unsigned long)frames;
+	mcl->args[1] = entries;
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+		   unsigned long ss, unsigned long esp)
+{
+	mcl->op = __HYPERVISOR_stack_switch;
+	mcl->args[0] = ss;
+	mcl->args[1] = esp;
+}
+
 #endif /* __HYPERCALL_H__ */
diff --git a/include/xen/features.h b/include/xen/features.h
new file mode 100644
index 000000000000..27292d4d2a6a
--- /dev/null
+++ b/include/xen/features.h
@@ -0,0 +1,23 @@
+/******************************************************************************
+ * features.h
+ *
+ * Query the features reported by Xen.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __XEN_FEATURES_H__
+#define __XEN_FEATURES_H__
+
+#include <xen/interface/features.h>
+
+void xen_setup_features(void);
+
+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
+
+static inline int xen_feature(int flag)
+{
+	return xen_features[flag];
+}
+
+#endif /* __ASM_XEN_FEATURES_H__ */
diff --git a/include/xen/page.h b/include/xen/page.h
new file mode 100644
index 000000000000..1df6c1930578
--- /dev/null
+++ b/include/xen/page.h
@@ -0,0 +1,179 @@
+#ifndef __XEN_PAGE_H
+#define __XEN_PAGE_H
+
+#include <linux/pfn.h>
+
+#include <asm/uaccess.h>
+
+#include <xen/features.h>
+
+#ifdef CONFIG_X86_PAE
+/* Xen machine address */
+typedef struct xmaddr {
+	unsigned long long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+	unsigned long long paddr;
+} xpaddr_t;
+#else
+/* Xen machine address */
+typedef struct xmaddr {
+	unsigned long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+	unsigned long paddr;
+} xpaddr_t;
+#endif
+
+#define XMADDR(x)	((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x)	((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY	(~0UL)
+#define FOREIGN_FRAME_BIT	(1UL<<31)
+#define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
+
+extern unsigned long *phys_to_machine_mapping;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return pfn;
+
+	return phys_to_machine_mapping[(unsigned int)(pfn)] &
+		~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 1;
+
+	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return mfn;
+
+#if 0
+	if (unlikely((mfn >> machine_to_phys_order) != 0))
+		return max_mapnr;
+#endif
+
+	pfn = 0;
+	/*
+	 * The array access can fail (e.g., device space beyond end of RAM).
+	 * In such cases it doesn't matter what we return (we return garbage),
+	 * but we must handle the fault without crashing!
+	 */
+	__get_user(pfn, &machine_to_phys_mapping[mfn]);
+
+	return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+	unsigned offset = phys.paddr & ~PAGE_MASK;
+	return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+	unsigned offset = machine.maddr & ~PAGE_MASK;
+	return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+	extern unsigned long max_mapnr;
+	unsigned long pfn = mfn_to_pfn(mfn);
+	if ((pfn < max_mapnr)
+	    && !xen_feature(XENFEAT_auto_translated_physmap)
+	    && (phys_to_machine_mapping[pfn] != mfn))
+		return max_mapnr; /* force !pfn_valid() */
+	return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+	phys_to_machine_mapping[pfn] = mfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |			\
+		       (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+	pte_t pte;
+
+	pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
+		(pgprot_val(pgprot) >> 32);
+	pte.pte_high &= (__supported_pte_mask >> 32);
+	pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
+	pte.pte_low &= __supported_pte_mask;
+
+	return pte;
+}
+
+static inline unsigned long long pte_val_ma(pte_t x)
+{
+	return ((unsigned long long)x.pte_high << 32) | x.pte_low;
+}
+#define pmd_val_ma(v) ((v).pmd)
+#define pud_val_ma(v) ((v).pgd.pgd)
+#define __pte_ma(x)	((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } )
+#define __pmd_ma(x)	((pmd_t) { (x) } )
+#else  /* !X86_PAE */
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
+#define mfn_pte(pfn, prot)	__pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pte_val_ma(x)	((x).pte_low)
+#define pmd_val_ma(v)	((v).pud.pgd.pgd)
+#define __pte_ma(x)	((pte_t) { (x) } )
+#endif	/* CONFIG_X86_PAE */
+
+#define pgd_val_ma(x)	((x).pgd)
+
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* __XEN_PAGE_H */
-- 
cgit v1.2.3


From 3b827c1b3aadf3adb4c602d19863f2d24e7cbc18 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: xen: virtual mmu

Xen pagetable handling, including the machinery to implement direct
pagetables.

Xen presents the real CPU's pagetables directly to guests, with no
added shadowing or other layer of abstraction.  Naturally this means
the hypervisor must maintain close control over what the guest can put
into the pagetable.

When the guest modifies the pte/pmd/pgd, it must convert its
domain-specific notion of a "physical" pfn into a global machine frame
number (mfn) before inserting the entry into the pagetable.  Xen will
check to make sure the domain is allowed to create a mapping of the
given mfn.

Xen also requires that all mappings the guest has of its own active
pagetable are read-only.  This is relatively easy to implement in
Linux because all pagetables share the same pte pages for kernel
mappings, so updating the pte in one pagetable will implicitly update
the mapping in all pagetables.

Normally a pagetable becomes active when you point to it with cr3 (or
the Xen equivalent), but when you do so, Xen must check the whole
pagetable for correctness, which is clearly a performance problem.

Xen solves this with pinning which keeps a pagetable effectively
active even if its currently unused, which means that all the normal
update rules are enforced.  This means that it need not revalidate the
pagetable when loading cr3.

This patch has a first-cut implementation of pinning, but it is more
fully implemented in a later patch.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/Makefile    |   2 +-
 arch/i386/xen/enlighten.c |  30 +++-
 arch/i386/xen/mmu.c       | 420 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/mmu.h       |  47 ++++++
 4 files changed, 494 insertions(+), 5 deletions(-)
 create mode 100644 arch/i386/xen/mmu.c
 create mode 100644 arch/i386/xen/mmu.h

(limited to 'arch')

diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index 60bc1cfb101c..803c1ee2b768 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1 +1 @@
-obj-y		:= enlighten.o setup.o features.o multicalls.o
+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 2d484f9320de..c0b0aa7af145 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -39,6 +39,7 @@
 #include <asm/pgtable.h>
 
 #include "xen-ops.h"
+#include "mmu.h"
 #include "multicalls.h"
 
 EXPORT_SYMBOL_GPL(hypercall_page);
@@ -579,11 +580,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 		 * Should be set_fixmap(), but shared_info is a machine
 		 * address with no corresponding pseudo-phys address.
 		 */
-#if 0
 		set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
 			    PFN_DOWN(xen_start_info->shared_info),
 			    PAGE_KERNEL);
-#endif
 
 		HYPERVISOR_shared_info =
 			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
@@ -592,9 +591,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 		HYPERVISOR_shared_info =
 			(struct shared_info *)__va(xen_start_info->shared_info);
 
-#if 0
 	xen_pgd_pin(base);
-#endif
 
 	xen_vcpu_setup(smp_processor_id());
 }
@@ -690,6 +687,31 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.release_pd = xen_release_pd,
 	.release_pt = xen_release_pt,
 
+	.set_pte = xen_set_pte,
+	.set_pte_at = xen_set_pte_at,
+	.set_pmd = xen_set_pmd,
+
+	.pte_val = xen_pte_val,
+	.pgd_val = xen_pgd_val,
+
+	.make_pte = xen_make_pte,
+	.make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = xen_set_pte_atomic,
+	.set_pte_present = xen_set_pte_at,
+	.set_pud = xen_set_pud,
+	.pte_clear = xen_pte_clear,
+	.pmd_clear = xen_pmd_clear,
+
+	.make_pmd = xen_make_pmd,
+	.pmd_val = xen_pmd_val,
+#endif	/* PAE */
+
+	.activate_mm = xen_activate_mm,
+	.dup_mmap = xen_dup_mmap,
+	.exit_mmap = xen_exit_mmap,
+
 	.set_lazy_mode = xen_set_lazy_mode,
 };
 
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..de16cb5f55ca
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,420 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion.  In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable.  When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest.  This prevents uncontrolled
+ * guest updates to the pagetable.  Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow.  The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use.  This menas that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/bug.h>
+#include <linux/sched.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/paravirt.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+#include "mmu.h"
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+	pte_t *pte = lookup_address(address);
+	unsigned offset = address & PAGE_MASK;
+
+	BUG_ON(pte == NULL);
+
+	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+	pte_t *pte, ptev;
+	unsigned long address = (unsigned long)vaddr;
+
+	pte = lookup_address(address);
+	BUG_ON(pte == NULL);
+
+	ptev = pte_wrprotect(*pte);
+
+	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+		BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+	pte_t *pte, ptev;
+	unsigned long address = (unsigned long)vaddr;
+
+	pte = lookup_address(address);
+	BUG_ON(pte == NULL);
+
+	ptev = pte_mkwrite(*pte);
+
+	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+		BUG();
+}
+
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptep).maddr;
+	u.val = pte_val_ma(pte);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pmd_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+#endif
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = swapper_pg_dir + pgd_index(vaddr);
+	if (pgd_none(*pgd)) {
+		BUG();
+		return;
+	}
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		BUG();
+		return;
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		BUG();
+		return;
+	}
+	pte = pte_offset_kernel(pmd, vaddr);
+	/* <mfn,flags> stored as-is, to permit clearing entries */
+	xen_set_pte(pte, mfn_pte(mfn, flags));
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, pte_t pteval)
+{
+	if ((mm != current->mm && mm != &init_mm) ||
+	    HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
+		xen_set_pte(ptep, pteval);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	ptep->pte_low = 0;
+	smp_wmb();		/* make sure low gets written first */
+	ptep->pte_high = 0;
+}
+
+void xen_pmd_clear(pmd_t *pmdp)
+{
+	xen_set_pmd(pmdp, __pmd(0));
+}
+
+unsigned long long xen_pte_val(pte_t pte)
+{
+	unsigned long long ret = 0;
+
+	if (pte.pte_low) {
+		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	}
+
+	return ret;
+}
+
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+	unsigned long long ret = pmd.pmd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+	unsigned long long ret = pgd.pgd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+pte_t xen_make_pte(unsigned long long pte)
+{
+	if (pte & 1)
+		pte = phys_to_machine(XPADDR(pte)).maddr;
+
+	return (pte_t){ pte, pte >> 32 };
+}
+
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+	if (pmd & 1)
+		pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+	return (pmd_t){ pmd };
+}
+
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+	if (pgd & _PAGE_PRESENT)
+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+	return (pgd_t){ pgd };
+}
+#else  /* !PAE */
+unsigned long xen_pte_val(pte_t pte)
+{
+	unsigned long ret = pte.pte_low;
+
+	if (ret & _PAGE_PRESENT)
+		ret = machine_to_phys(XMADDR(ret)).paddr;
+
+	return ret;
+}
+
+unsigned long xen_pmd_val(pmd_t pmd)
+{
+	/* a BUG here is a lot easier to track down than a NULL eip */
+	BUG();
+	return 0;
+}
+
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+	unsigned long ret = pgd.pgd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+pte_t xen_make_pte(unsigned long pte)
+{
+	if (pte & _PAGE_PRESENT)
+		pte = phys_to_machine(XPADDR(pte)).maddr;
+
+	return (pte_t){ pte };
+}
+
+pmd_t xen_make_pmd(unsigned long pmd)
+{
+	/* a BUG here is a lot easier to track down than a NULL eip */
+	BUG();
+	return __pmd(0);
+}
+
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+	if (pgd & _PAGE_PRESENT)
+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+	return (pgd_t){ pgd };
+}
+#endif	/* CONFIG_X86_PAE */
+
+
+
+static void pgd_walk_set_prot(void *pt, pgprot_t flags)
+{
+	unsigned long pfn = PFN_DOWN(__pa(pt));
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
+					 pfn_pte(pfn, flags), 0) < 0)
+		BUG();
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+	pgd_t *pgd = pgd_base;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int    g, u, m;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+
+		if (PTRS_PER_PUD > 1) /* not folded */
+			pgd_walk_set_prot(pud, flags);
+
+		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+			if (pud_none(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+
+			if (PTRS_PER_PMD > 1) /* not folded */
+				pgd_walk_set_prot(pmd, flags);
+
+			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+				if (pmd_none(*pmd))
+					continue;
+
+				/* This can get called before mem_map
+				   is set up, so we assume nothing is
+				   highmem at that point. */
+				if (mem_map == NULL ||
+				    !PageHighMem(pmd_page(*pmd))) {
+					pte = pte_offset_kernel(pmd, 0);
+					pgd_walk_set_prot(pte, flags);
+				}
+			}
+		}
+	}
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
+						 flags),
+					 UVMF_TLB_FLUSH) < 0)
+		BUG();
+}
+
+
+/* This is called just after a mm has been duplicated from its parent,
+   but it has not been used yet.  We need to make sure that its
+   pagetable is all read-only, and can be pinned. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+	struct mmuext_op op;
+
+	pgd_walk(pgd, PAGE_KERNEL_RO);
+
+#if defined(CONFIG_X86_PAE)
+	op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+	op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+/* Release a pagetables pages back as normal RW */
+void xen_pgd_unpin(pgd_t *pgd)
+{
+	struct mmuext_op op;
+
+	op.cmd = MMUEXT_UNPIN_TABLE;
+	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+
+	pgd_walk(pgd, PAGE_KERNEL);
+}
+
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+	xen_pgd_pin(next->pgd);
+}
+
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	xen_pgd_pin(mm->pgd);
+}
+
+void xen_exit_mmap(struct mm_struct *mm)
+{
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+
+	/*
+	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+	 */
+	if (tsk->active_mm == mm) {
+		tsk->active_mm = &init_mm;
+		atomic_inc(&init_mm.mm_count);
+
+		switch_mm(mm, &init_mm, tsk);
+
+		atomic_dec(&mm->mm_count);
+		BUG_ON(atomic_read(&mm->mm_count) == 0);
+	}
+
+	task_unlock(tsk);
+
+	xen_pgd_unpin(mm->pgd);
+}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 000000000000..764eaaae0a2f
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,47 @@
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void xen_set_pte(pte_t *ptep, pte_t pteval);
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, pte_t pteval);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+void xen_exit_mmap(struct mm_struct *mm);
+
+void xen_pgd_pin(pgd_t *pgd);
+void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, pte_t pteval);
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif	/* _XEN_MMU_H */
-- 
cgit v1.2.3


From e46cdb66c8fc1c8d61cfae0f219ff47ac4b9d531 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: event channels

Xen implements interrupts in terms of event channels.  Each guest
domain gets 1024 event channels which can be used for a variety of
purposes, such as Xen timer events, inter-domain events,
inter-processor events (IPI) or for real hardware IRQs.

Within the kernel, we map the event channels to IRQs, and implement
the whole interrupt handling using a Xen irq_chip.

Rather than setting NR_IRQ to 1024 under PARAVIRT in order to
accomodate Xen, we create a dynamic mapping between event channels and
IRQs.  Ideally, Linux will eventually move towards dynamically
allocating per-irq structures, and we can use a 1:1 mapping between
event channels and irqs.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/i386/xen/Makefile    |   3 +-
 arch/i386/xen/enlighten.c |   1 +
 arch/i386/xen/events.c    | 511 ++++++++++++++++++++++++++++++++++++++++++++++
 include/xen/events.h      |  28 +++
 4 files changed, 542 insertions(+), 1 deletion(-)
 create mode 100644 arch/i386/xen/events.c
 create mode 100644 include/xen/events.h

(limited to 'arch')

diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index 803c1ee2b768..7a78f27bfb16 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1 +1,2 @@
-obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o
+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
+			events.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index c0b0aa7af145..6417dfdccb4c 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -607,6 +607,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
+	.init_IRQ = xen_init_IRQ,
 
 	.cpuid = xen_cpuid,
 
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 000000000000..e7c5d00ab4fe
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,511 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels.  Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels.  The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip.  When an event is recieved, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications.  This includes all the virtual
+ *    device events, since they're driven by front-ends in another domain
+ *    (typically dom0).
+ * 2. VIRQs, typically used for timers.  These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+	unsigned short evtchn;
+	unsigned char index;
+	unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+	[0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)	((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+	(void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+	return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+	return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+	return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+	return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+					   struct shared_info *sh,
+					   unsigned int idx)
+{
+	return (sh->evtchn_pending[idx] &
+		cpu_evtchn_mask[cpu][idx] &
+		~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+	int irq = evtchn_to_irq[chn];
+
+	BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
+	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+	__set_bit(chn, cpu_evtchn_mask[cpu]);
+
+	cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+	int i;
+	/* By default all event channels notify CPU#0. */
+	for (i = 0; i < NR_IRQS; i++)
+		irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+	memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	unsigned int cpu = get_cpu();
+
+	BUG_ON(!irqs_disabled());
+
+	/* Slow path (hypercall) if this is a non-local port. */
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_unmask unmask = { .port = port };
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+	} else {
+		struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+		sync_clear_bit(port, &s->evtchn_mask[0]);
+
+		/*
+		 * The following is basically the equivalent of
+		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+		 * the interrupt edge' if the channel is masked.
+		 */
+		if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+		    !sync_test_and_set_bit(port / BITS_PER_LONG,
+					   &vcpu_info->evtchn_pending_sel))
+			vcpu_info->evtchn_upcall_pending = 1;
+	}
+
+	put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+	int irq;
+
+	/* Only allocate from dynirq range */
+	for (irq = 0; irq < NR_IRQS; irq++)
+		if (irq_bindcount[irq] == 0)
+			break;
+
+	if (irq == NR_IRQS)
+		panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+	return irq;
+}
+
+static int bind_evtchn_to_irq(unsigned int evtchn)
+{
+	int irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = evtchn_to_irq[evtchn];
+
+	if (irq == -1) {
+		irq = find_unbound_irq();
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+					      handle_level_irq, "event");
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+	}
+
+	irq_bindcount[irq]++;
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+	struct evtchn_bind_virq bind_virq;
+	int evtchn, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(virq_to_irq, cpu)[virq];
+
+	if (irq == -1) {
+		bind_virq.virq = virq;
+		bind_virq.vcpu = cpu;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						&bind_virq) != 0)
+			BUG();
+		evtchn = bind_virq.port;
+
+		irq = find_unbound_irq();
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+					      handle_level_irq, "virq");
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+		per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+
+	irq_bindcount[irq]++;
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+	struct evtchn_close close;
+	int evtchn = evtchn_from_irq(irq);
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+		close.port = evtchn;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+			BUG();
+
+		switch (type_from_irq(irq)) {
+		case IRQT_VIRQ:
+			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+				[index_from_irq(irq)] = -1;
+			break;
+		default:
+			break;
+		}
+
+		/* Closed ports are implicitly re-bound to VCPU0. */
+		bind_evtchn_to_cpu(evtchn, 0);
+
+		evtchn_to_irq[evtchn] = -1;
+		irq_info[irq] = IRQ_UNBOUND;
+
+		dynamic_irq_init(irq);
+	}
+
+	spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+			      irqreturn_t (*handler)(int, void *),
+			      unsigned long irqflags,
+			      const char *devname, void *dev_id)
+{
+	unsigned int irq;
+	int retval;
+
+	irq = bind_evtchn_to_irq(evtchn);
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+			    irqreturn_t (*handler)(int, void *),
+			    unsigned long irqflags, const char *devname, void *dev_id)
+{
+	unsigned int irq;
+	int retval;
+
+	irq = bind_virq_to_irq(virq, cpu);
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+	free_irq(irq, dev_id);
+	unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+/*
+ * Search the CPUs pending events bitmasks.  For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching.  The first level is
+ * a bitset of words which contain pending event bits.  The second
+ * level is a bitset of pending events themselves.
+ */
+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+	int cpu = get_cpu();
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+	unsigned long pending_words;
+
+	vcpu_info->evtchn_upcall_pending = 0;
+
+	/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+	pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+	while (pending_words != 0) {
+		unsigned long pending_bits;
+		int word_idx = __ffs(pending_words);
+		pending_words &= ~(1UL << word_idx);
+
+		while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+			int bit_idx = __ffs(pending_bits);
+			int port = (word_idx * BITS_PER_LONG) + bit_idx;
+			int irq = evtchn_to_irq[port];
+
+			if (irq != -1) {
+				regs->orig_eax = ~irq;
+				do_IRQ(regs);
+			}
+		}
+	}
+
+	put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+	struct evtchn_bind_vcpu bind_vcpu;
+	int evtchn = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	/* Send future instances of this interrupt to other vcpu. */
+	bind_vcpu.port = evtchn;
+	bind_vcpu.vcpu = tcpu;
+
+	/*
+	 * If this fails, it usually just indicates that we're dealing with a
+	 * virq or IPI channel, which don't actually need to be rebound. Ignore
+	 * it, but don't do the xenlinux-level rebind in that case.
+	 */
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+		bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+	unsigned tcpu = first_cpu(dest);
+	rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	move_native_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+	int ret = 0;
+
+	if (VALID_EVTCHN(evtchn)) {
+		set_evtchn(evtchn);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+	.name		= "xen-dyn",
+	.mask		= disable_dynirq,
+	.unmask		= enable_dynirq,
+	.ack		= ack_dynirq,
+	.set_affinity	= set_affinity_irq,
+	.retrigger	= retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+	int i;
+
+	init_evtchn_cpu_bindings();
+
+	/* No event channels are 'live' right now. */
+	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+		mask_evtchn(i);
+
+	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+	for (i = 0; i < NR_IRQS; i++)
+		irq_bindcount[i] = 0;
+
+	irq_ctx_init(smp_processor_id());
+}
diff --git a/include/xen/events.h b/include/xen/events.h
new file mode 100644
index 000000000000..77f71c906823
--- /dev/null
+++ b/include/xen/events.h
@@ -0,0 +1,28 @@
+#ifndef _XEN_EVENTS_H
+#define _XEN_EVENTS_H
+
+#include <linux/irq.h>
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+			      irqreturn_t (*handler)(int, void *),
+			      unsigned long irqflags, const char *devname,
+			      void *dev_id);
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+			    irqreturn_t (*handler)(int, void *),
+			    unsigned long irqflags, const char *devname, void *dev_id);
+
+/*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (even for bindings
+ * made with bind_evtchn_to_irqhandler()).
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+static inline void notify_remote_via_evtchn(int port)
+{
+	struct evtchn_send send = { .port = port };
+	(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
+}
+
+extern void notify_remote_via_irq(int irq);
+#endif	/* _XEN_EVENTS_H */
-- 
cgit v1.2.3


From 15c84731d647c34d1491793fa6be96f5de3432eb Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: time implementation

Xen maintains a base clock which measures nanoseconds since system
boot.  This is provided to guests via a shared page which contains a
base time in ns, a tsc timestamp at that point and tsc frequency
parameters.  Guests can compute the current time by reading the tsc
and using it to extrapolate the current time from the basetime.  The
hypervisor makes sure that the frequency parameters are updated
regularly, paricularly if the tsc changes rate or stops.

This is implemented as a clocksource, so the interface to the rest of
the kernel is a simple clocksource which simply returns the current
time directly in nanoseconds.

Xen also provides a simple timer mechanism, which allows a timeout to
be set in the future.  When that time arrives, a timer event is sent
to the guest.  There are two timer interfaces:
 - An old one which also delivers a stream of (unused) ticks at 100Hz,
   and on the same event, the actual timer events.  The 100Hz ticks
   cause a lot of spurious wakeups, but are basically harmless.
 - The new timer interface doesn't have the 100Hz ticks, and can also
   fail if the specified time is in the past.

This code presents the Xen timer as a clockevent driver, and uses the
new interface by preference.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/i386/xen/Makefile    |   2 +-
 arch/i386/xen/enlighten.c |   6 +
 arch/i386/xen/time.c      | 407 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+), 1 deletion(-)
 create mode 100644 arch/i386/xen/time.c

(limited to 'arch')

diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index 7a78f27bfb16..bf51cabed0d2 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1,2 +1,2 @@
 obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
-			events.o
+			events.o time.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 6417dfdccb4c..25eb3592f11d 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -609,6 +609,12 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.arch_setup = xen_arch_setup,
 	.init_IRQ = xen_init_IRQ,
 
+	.time_init = xen_time_init,
+	.set_wallclock = xen_set_wallclock,
+	.get_wallclock = xen_get_wallclock,
+	.get_cpu_khz = xen_cpu_khz,
+	.sched_clock = xen_clocksource_read,
+
 	.cpuid = xen_cpuid,
 
 	.set_debugreg = xen_set_debugreg,
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 000000000000..b457980ff3c2
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,407 @@
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP	100000
+
+/* These are perodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+	u32 tsc_to_nsec_mul;
+	int tsc_shift;
+	u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+unsigned long xen_cpu_khz(void)
+{
+	u64 cpu_khz = 1000000ULL << 32;
+	const struct vcpu_time_info *info =
+		&HYPERVISOR_shared_info->vcpu_info[0].time;
+
+	do_div(cpu_khz, info->tsc_to_system_mul);
+	if (info->tsc_shift < 0)
+		cpu_khz <<= -info->tsc_shift;
+	else
+		cpu_khz >>= info->tsc_shift;
+
+	return cpu_khz;
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static void get_time_values_from_xen(void)
+{
+	struct vcpu_time_info   *src;
+	struct shadow_time_info *dst;
+
+	preempt_disable();
+
+	/* src is shared memory with the hypervisor, so we need to
+	   make sure we get a consistent snapshot, even in the face of
+	   being preempted. */
+	src = &__get_cpu_var(xen_vcpu)->time;
+	dst = &__get_cpu_var(shadow_time);
+
+	do {
+		dst->version = src->version;
+		rmb();		/* fetch version before data */
+		dst->tsc_timestamp     = src->tsc_timestamp;
+		dst->system_timestamp  = src->system_time;
+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+		dst->tsc_shift         = src->tsc_shift;
+		rmb();		/* test version after fetching data */
+	} while ((src->version & 1) | (dst->version ^ src->version));
+
+	preempt_enable();
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+	u64 now, delta;
+	rdtscll(now);
+	delta = now - shadow->tsc_timestamp;
+	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+cycle_t xen_clocksource_read(void)
+{
+	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+	cycle_t ret;
+
+	get_time_values_from_xen();
+
+	ret = shadow->system_timestamp + get_nsec_offset(shadow);
+
+	put_cpu_var(shadow_time);
+
+	return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+	const struct shared_info *s = HYPERVISOR_shared_info;
+	u32 version;
+	u64 delta;
+	struct timespec now;
+
+	/* get wallclock at system boot */
+	do {
+		version = s->wc_version;
+		rmb();		/* fetch version before time */
+		now.tv_sec  = s->wc_sec;
+		now.tv_nsec = s->wc_nsec;
+		rmb();		/* fetch time before checking version */
+	} while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+	delta = xen_clocksource_read();	/* time since system boot */
+	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+	now.tv_sec = delta;
+
+	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+	struct timespec ts;
+
+	xen_read_wallclock(&ts);
+
+	return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+	/* do nothing for domU */
+	return -1;
+}
+
+static struct clocksource xen_clocksource __read_mostly = {
+	.name = "xen",
+	.rating = 400,
+	.read = xen_clocksource_read,
+	.mask = ~0,
+	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
+	.shift = XEN_SHIFT,
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+   Xen clockevent implementation
+
+   Xen has two clockevent implementations:
+
+   The old timer_op one works with all released versions of Xen prior
+   to version 3.0.4.  This version of the hypervisor provides a
+   single-shot timer with nanosecond resolution.  However, sharing the
+   same event channel is a 100Hz tick which is delivered while the
+   vcpu is running.  We don't care about or use this tick, but it will
+   cause the core time code to think the timer fired too soon, and
+   will end up resetting it each time.  It could be filtered, but
+   doing so has complications when the ktime clocksource is not yet
+   the xen clocksource (ie, at boot time).
+
+   The new vcpu_op-based timer interface allows the tick timer period
+   to be changed or turned off.  The tick timer is not useful as a
+   periodic timer because events are only delivered to running vcpus.
+   The one-shot timer can report when a timeout is in the past, so
+   set_next_event is capable of returning -ETIME when appropriate.
+   This interface is used when available.
+*/
+
+
+/*
+  Get a hypervisor absolute time.  In theory we could maintain an
+  offset between the kernel's time and the hypervisor's time, and
+  apply that to a kernel's absolute timeout.  Unfortunately the
+  hypervisor and kernel times can drift even if the kernel is using
+  the Xen clocksource, because ntp can warp the kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+	return xen_clocksource_read() + delta;
+}
+
+static void xen_timerop_set_mode(enum clock_event_mode mode,
+				 struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		/* unsupported */
+		WARN_ON(1);
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		HYPERVISOR_set_timer_op(0);  /* cancel timeout */
+		break;
+	}
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+				      struct clock_event_device *evt)
+{
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+		BUG();
+
+	/* We may have missed the deadline, but there's no real way of
+	   knowing for sure.  If the event was in the past, then we'll
+	   get an immediate interrupt. */
+
+	return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+	.name = "xen",
+	.features = CLOCK_EVT_FEAT_ONESHOT,
+
+	.max_delta_ns = 0xffffffff,
+	.min_delta_ns = TIMER_SLOP,
+
+	.mult = 1,
+	.shift = 0,
+	.rating = 500,
+
+	.set_mode = xen_timerop_set_mode,
+	.set_next_event = xen_timerop_set_next_event,
+};
+
+
+
+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
+				struct clock_event_device *evt)
+{
+	int cpu = smp_processor_id();
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1);	/* unsupported */
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+			BUG();
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+			BUG();
+		break;
+	}
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+				     struct clock_event_device *evt)
+{
+	int cpu = smp_processor_id();
+	struct vcpu_set_singleshot_timer single;
+	int ret;
+
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	single.timeout_abs_ns = get_abs_timeout(delta);
+	single.flags = VCPU_SSHOTTMR_future;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
+
+	BUG_ON(ret != 0 && ret != -ETIME);
+
+	return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+	.name = "xen",
+	.features = CLOCK_EVT_FEAT_ONESHOT,
+
+	.max_delta_ns = 0xffffffff,
+	.min_delta_ns = TIMER_SLOP,
+
+	.mult = 1,
+	.shift = 0,
+	.rating = 500,
+
+	.set_mode = xen_vcpuop_set_mode,
+	.set_next_event = xen_vcpuop_set_next_event,
+};
+
+static const struct clock_event_device *xen_clockevent =
+	&xen_timerop_clockevent;
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+	irqreturn_t ret;
+
+	ret = IRQ_NONE;
+	if (evt->event_handler) {
+		evt->event_handler(evt);
+		ret = IRQ_HANDLED;
+	}
+
+	return ret;
+}
+
+static void xen_setup_timer(int cpu)
+{
+	const char *name;
+	struct clock_event_device *evt;
+	int irq;
+
+	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
+	if (!name)
+		name = "<timer kasprintf failed>";
+
+	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				      name, NULL);
+
+	evt = &get_cpu_var(xen_clock_events);
+	memcpy(evt, xen_clockevent, sizeof(*evt));
+
+	evt->cpumask = cpumask_of_cpu(cpu);
+	evt->irq = irq;
+	clockevents_register_device(evt);
+
+	put_cpu_var(xen_clock_events);
+}
+
+__init void xen_time_init(void)
+{
+	int cpu = smp_processor_id();
+
+	get_time_values_from_xen();
+
+	clocksource_register(&xen_clocksource);
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+		/* Successfully turned off 100hz tick, so we have the
+		   vcpuop-based timer interface */
+		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+		xen_clockevent = &xen_vcpuop_clockevent;
+	}
+
+	/* Set initial system time with full resolution */
+	xen_read_wallclock(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+	tsc_disable = 0;
+
+	xen_setup_timer(cpu);
+}
-- 
cgit v1.2.3


From e738fca8d7dffec30eeee231c38f128eed56c8c8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: configuration

Put config options for Xen after the core pieces are in place.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/Kconfig     |  2 ++
 arch/i386/xen/Kconfig | 11 +++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 arch/i386/xen/Kconfig

(limited to 'arch')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index c7c9c2a15fab..7a11b905ef49 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+source "arch/i386/xen/Kconfig"
+
 config VMI
 	bool "VMI Paravirt-ops support"
 	depends on PARAVIRT
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..7c5550058c15
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+	bool "Enable support for Xen hypervisor"
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES)
+	help
+	  This is the Linux Xen port.  Enabling this will allow the
+	  kernel to boot in a paravirtualized environment under the
+	  Xen hypervisor.
-- 
cgit v1.2.3


From f4f97b3ea90130520afb478cbc2918be2b6587b8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: Complete pagetable pinning

Xen requires all active pagetables to be marked read-only.  When the
base of the pagetable is loaded into %cr3, the hypervisor validates
the entire pagetable and only allows the load to proceed if it all
checks out.

This is pretty slow, so to mitigate this cost Xen has a notion of
pinned pagetables.  Pinned pagetables are pagetables which are
considered to be active even if no processor's cr3 is pointing to is.
This means that it must remain read-only and all updates are validated
by the hypervisor.  This makes context switches much cheaper, because
the hypervisor doesn't need to revalidate the pagetable each time.

This also adds a new paravirt hook which is called during setup once
the zones and memory allocator have been initialized.  When the
init_mm pagetable is first built, the struct page array does not yet
exist, and so there's nowhere to put he init_mm pagetable's PG_pinned
flags.  Once the zones are initialized and the struct page array
exists, we can set the PG_pinned flags for those pages.

This patch also adds the Xen support for pte pages allocated out of
highmem (highpte) by implementing xen_kmap_atomic_pte.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Zach Amsden <zach@vmware.com>
---
 arch/i386/xen/enlighten.c |  87 ++++++++++++----
 arch/i386/xen/mmu.c       | 260 ++++++++++++++++++++++++++++++----------------
 arch/i386/xen/mmu.h       |   2 +-
 arch/i386/xen/xen-ops.h   |   2 +
 4 files changed, 242 insertions(+), 109 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 25eb3592f11d..86e68e680116 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -21,6 +21,9 @@
 #include <linux/sched.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
@@ -500,32 +503,59 @@ static void xen_write_cr3(unsigned long cr3)
 	}
 }
 
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 {
-	/* XXX pfn isn't necessarily a lowmem page */
+	BUG_ON(mem_map);	/* should only be used early */
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
-static void xen_alloc_pd(u32 pfn)
+/* This needs to make sure the new pte page is pinned iff its being
+   attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 {
-	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
+	struct page *page = pfn_to_page(pfn);
 
-static void xen_release_pd(u32 pfn)
-{
-	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		SetPagePinned(page);
+
+		if (!PageHighMem(page))
+			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+		else
+			/* make sure there are no stray mappings of
+			   this page */
+			kmap_flush_unused();
+	}
 }
 
+/* This should never happen until we're OK to use struct page */
 static void xen_release_pt(u32 pfn)
 {
-	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(page)) {
+		if (!PageHighMem(page))
+			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+	}
 }
 
-static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn,
-					u32 start, u32 count)
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 {
-	xen_alloc_pd(pfn);
+	pgprot_t prot = PAGE_KERNEL;
+
+	if (PagePinned(page))
+		prot = PAGE_KERNEL_RO;
+
+	if (0 && PageHighMem(page))
+		printk("mapping highpte %lx type %d prot %s\n",
+		       page_to_pfn(page), type,
+		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+	return kmap_atomic_prot(page, type, prot);
 }
+#endif
 
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
@@ -553,7 +583,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
 				       PAGE_SIZE);
 
-				xen_alloc_pd(PFN_DOWN(__pa(pmd)));
+				make_lowmem_page_readonly(pmd);
 
 				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
 			} else
@@ -574,6 +604,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
 {
+	/* This will work as long as patching hasn't happened yet
+	   (which it hasn't) */
+	paravirt_ops.alloc_pt = xen_alloc_pt;
+
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		/*
 		 * Create a mapping for the shared info page.
@@ -591,7 +625,19 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 		HYPERVISOR_shared_info =
 			(struct shared_info *)__va(xen_start_info->shared_info);
 
-	xen_pgd_pin(base);
+	/* Actually pin the pagetable down, but we can't set PG_pinned
+	   yet because the page structures don't exist yet. */
+	{
+		struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+		op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+		op.cmd = MMUEXT_PIN_L3_TABLE;
+#endif
+		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
+		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+			BUG();
+	}
 
 	xen_vcpu_setup(smp_processor_id());
 }
@@ -608,6 +654,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
 	.init_IRQ = xen_init_IRQ,
+	.post_allocator_init = xen_mark_init_mm_pinned,
 
 	.time_init = xen_time_init,
 	.set_wallclock = xen_set_wallclock,
@@ -688,11 +735,15 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.pagetable_setup_start = xen_pagetable_setup_start,
 	.pagetable_setup_done = xen_pagetable_setup_done,
 
-	.alloc_pt = xen_alloc_pt,
-	.alloc_pd = xen_alloc_pd,
-	.alloc_pd_clone = xen_alloc_pd_clone,
-	.release_pd = xen_release_pd,
+	.alloc_pt = xen_alloc_pt_init,
 	.release_pt = xen_release_pt,
+	.alloc_pd = paravirt_nop,
+	.alloc_pd_clone = paravirt_nop,
+	.release_pd = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+	.kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
 
 	.set_pte = xen_set_pte,
 	.set_pte_at = xen_set_pte_at,
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index de16cb5f55ca..53501ce2d15c 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -38,19 +38,22 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
+#include <linux/highmem.h>
 #include <linux/bug.h>
 #include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/paravirt.h>
 
 #include <asm/xen/hypercall.h>
-#include <asm/paravirt.h>
+#include <asm/xen/hypervisor.h>
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
 
+#include "multicalls.h"
 #include "mmu.h"
 
 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
@@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptep).maddr;
-	u.val = pte_val_ma(pte);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
 	struct mmu_update u;
@@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
 		BUG();
 }
 
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
-{
-	struct mmu_update u;
-
-	u.ptr = virt_to_machine(ptr).maddr;
-	u.val = pud_val_ma(val);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
-}
-#endif
-
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
@@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 }
 
 #ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((u64 *)ptep, pte_val_ma(pte));
@@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd)
 	return (pgd_t){ pgd };
 }
 #else  /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	*ptep = pte;
+}
+
 unsigned long xen_pte_val(pte_t pte)
 {
 	unsigned long ret = pte.pte_low;
@@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte)
 	return ret;
 }
 
-unsigned long xen_pmd_val(pmd_t pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return 0;
-}
-
 unsigned long xen_pgd_val(pgd_t pgd)
 {
 	unsigned long ret = pgd.pgd;
@@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte)
 	return (pte_t){ pte };
 }
 
-pmd_t xen_make_pmd(unsigned long pmd)
-{
-	/* a BUG here is a lot easier to track down than a NULL eip */
-	BUG();
-	return __pmd(0);
-}
-
 pgd_t xen_make_pgd(unsigned long pgd)
 {
 	if (pgd & _PAGE_PRESENT)
@@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd)
 
 
-static void pgd_walk_set_prot(void *pt, pgprot_t flags)
-{
-	unsigned long pfn = PFN_DOWN(__pa(pt));
-
-	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
-					 pfn_pte(pfn, flags), 0) < 0)
-		BUG();
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+/*
+  (Yet another) pagetable walker.  This one is intended for pinning a
+  pagetable.  This means that it walks a pagetable and calls the
+  callback function on each page it finds making up the page table,
+  at every level.  It walks the entire pagetable, but it only bothers
+  pinning pte pages which are below pte_limit.  In the normal case
+  this will be TASK_SIZE, but at boot we need to pin up to
+  FIXADDR_TOP.  But the important bit is that we don't pin beyond
+  there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int    g, u, m;
+	int flush = 0;
+	unsigned long addr = 0;
+	unsigned long pgd_next;
+
+	BUG_ON(limit > FIXADDR_TOP);
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
+		return 0;
+
+	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+		pud_t *pud;
+		unsigned long pud_limit, pud_next;
 
-	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
-		if (pgd_none(*pgd))
+		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+		if (!pgd_val(*pgd))
 			continue;
+
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			pgd_walk_set_prot(pud, flags);
+			flush |= (*func)(virt_to_page(pud), 0);
+
+		for (; addr != pud_limit; pud++, addr = pud_next) {
+			pmd_t *pmd;
+			unsigned long pmd_limit;
+
+			pud_next = pud_addr_end(addr, pud_limit);
+
+			if (pud_next < limit)
+				pmd_limit = pud_next;
+			else
+				pmd_limit = limit;
 
-		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
 			if (pud_none(*pud))
 				continue;
+
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				pgd_walk_set_prot(pmd, flags);
+				flush |= (*func)(virt_to_page(pmd), 0);
+
+			for (; addr != pmd_limit; pmd++) {
+				addr += (PAGE_SIZE * PTRS_PER_PTE);
+				if ((pmd_limit-1) < (addr-1)) {
+					addr = pmd_limit;
+					break;
+				}
 
-			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
 				if (pmd_none(*pmd))
 					continue;
 
-				/* This can get called before mem_map
-				   is set up, so we assume nothing is
-				   highmem at that point. */
-				if (mem_map == NULL ||
-				    !PageHighMem(pmd_page(*pmd))) {
-					pte = pte_offset_kernel(pmd, 0);
-					pgd_walk_set_prot(pte, flags);
-				}
+				flush |= (*func)(pmd_page(*pmd), 0);
 			}
 		}
 	}
 
-	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
-						 flags),
-					 UVMF_TLB_FLUSH) < 0)
-		BUG();
+	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+	return flush;
 }
 
+static int pin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+	int flush;
+
+	if (pgfl)
+		flush = 0;		/* already pinned */
+	else if (PageHighMem(page))
+		/* kmaps need flushing if we found an unpinned
+		   highpage */
+		flush = 1;
+	else {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		flush = 0;
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL_RO),
+					flags);
+	}
+
+	return flush;
+}
 
-/* This is called just after a mm has been duplicated from its parent,
-   but it has not been used yet.  We need to make sure that its
-   pagetable is all read-only, and can be pinned. */
+/* This is called just after a mm has been created, but it has not
+   been used yet.  We need to make sure that its pagetable is all
+   read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct mmuext_op op;
+	struct multicall_space mcs;
+	struct mmuext_op *op;
 
-	pgd_walk(pgd, PAGE_KERNEL_RO);
+	xen_mc_batch();
 
-#if defined(CONFIG_X86_PAE)
-	op.cmd = MMUEXT_PIN_L3_TABLE;
+	if (pgd_walk(pgd, pin_page, TASK_SIZE))
+		kmap_flush_unused();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+	op->cmd = MMUEXT_PIN_L3_TABLE;
 #else
-	op.cmd = MMUEXT_PIN_L2_TABLE;
+	op->cmd = MMUEXT_PIN_L2_TABLE;
 #endif
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(0);
 }
 
-/* Release a pagetables pages back as normal RW */
-void xen_pgd_unpin(pgd_t *pgd)
+/* The init_mm pagetable is really pinned as soon as its created, but
+   that's before we have page structures to store the bits.  So do all
+   the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
 {
-	struct mmuext_op op;
+	SetPagePinned(page);
+	return 0;
+}
 
-	op.cmd = MMUEXT_UNPIN_TABLE;
-	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+void __init xen_mark_init_mm_pinned(void)
+{
+	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
 
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+static int unpin_page(struct page *page, unsigned flags)
+{
+	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
-	pgd_walk(pgd, PAGE_KERNEL);
+	if (pgfl && !PageHighMem(page)) {
+		void *pt = lowmem_page_address(page);
+		unsigned long pfn = page_to_pfn(page);
+		struct multicall_space mcs = __xen_mc_entry(0);
+
+		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+					pfn_pte(pfn, PAGE_KERNEL),
+					flags);
+	}
+
+	return 0;		/* never need to flush on unpin */
 }
 
+/* Release a pagetables pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	xen_mc_batch();
+
+	mcs = __xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_UNPIN_TABLE;
+	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+	xen_mc_issue(0);
+}
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
+	spin_lock(&next->page_table_lock);
 	xen_pgd_pin(next->pgd);
+	spin_unlock(&next->page_table_lock);
 }
 
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+	spin_lock(&mm->page_table_lock);
 	xen_pgd_pin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }
 
 void xen_exit_mmap(struct mm_struct *mm)
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
index 764eaaae0a2f..49776fe9f02a 100644
--- a/arch/i386/xen/mmu.h
+++ b/arch/i386/xen/mmu.h
@@ -15,7 +15,7 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
 void xen_exit_mmap(struct mm_struct *mm);
 
 void xen_pgd_pin(pgd_t *pgd);
-void xen_pgd_unpin(pgd_t *pgd);
+//void xen_pgd_unpin(pgd_t *pgd);
 
 #ifdef CONFIG_X86_PAE
 unsigned long long xen_pte_val(pte_t);
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 79648fe1ab77..54d98b52085e 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -20,6 +20,8 @@ unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 cycle_t xen_clocksource_read(void);
 
+void xen_mark_init_mm_pinned(void);
+
 DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
 
 static inline unsigned xen_get_lazy_mode(void)
-- 
cgit v1.2.3


From 9a4029fd3409eb224eb62c32d9792071382694ec Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: ignore RW mapping of RO pages in pagetable_init

When setting up the initial pagetable, which includes mappings of all
low physical memory, ignore a mapping which tries to set the RW bit on
an RO pte.  An RO pte indicates a page which is part of the current
pagetable, and so it cannot be allowed to become RW.

Once xen_pagetable_setup_done is called, set_pte reverts to its normal
behaviour.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: ebiederm@xmission.com (Eric W. Biederman)
---
 arch/i386/xen/enlighten.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 86e68e680116..9550ae3b1fb1 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -505,7 +505,7 @@ static void xen_write_cr3(unsigned long cr3)
 
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
-static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 {
 	BUG_ON(mem_map);	/* should only be used early */
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
@@ -557,10 +557,32 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
+	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+			       pte_val_ma(pte));
+
+	return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+   doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+	pte = mask_rw_pte(ptep, pte);
+
+	xen_set_pte(ptep, pte);
+}
+
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
 	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
 
+	/* special set_pte for pagetable initialization */
+	paravirt_ops.set_pte = xen_set_pte_init;
+
 	init_mm.pgd = base;
 	/*
 	 * copy top-level of Xen-supplied pagetable into place.	 For
@@ -607,6 +629,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* This will work as long as patching hasn't happened yet
 	   (which it hasn't) */
 	paravirt_ops.alloc_pt = xen_alloc_pt;
+	paravirt_ops.set_pte = xen_set_pte;
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		/*
@@ -745,7 +768,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
 #endif
 
-	.set_pte = xen_set_pte,
+	.set_pte = NULL,	/* see xen_pagetable_setup_* */
 	.set_pte_at = xen_set_pte_at,
 	.set_pmd = xen_set_pmd,
 
-- 
cgit v1.2.3


From f91a8b447b9af64f589f6e13fec7f09b5927563d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: Account for stolen time

This patch accounts for the time stolen from our VCPUs.  Stolen time is
time where a vcpu is runnable and could be running, but all available
physical CPUs are being used for something else.

This accounting gets run on each timer interrupt, just as a way to get
it run relatively often, and when interesting things are going on.
Stolen time is not really used by much in the kernel; it is reported
in /proc/stats, and that's about it.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
---
 arch/i386/xen/time.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 150 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index b457980ff3c2..acbfd9969462 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
@@ -25,6 +26,7 @@
 
 /* Xen may fire a timer up to this many ns early */
 #define TIMER_SLOP	100000
+#define NS_PER_TICK	(1000000000LL / HZ)
 
 /* These are perodically updated in shared_info, and then copied here. */
 struct shadow_time_info {
@@ -37,6 +39,139 @@ struct shadow_time_info {
 
 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return an consistent snapshot of 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+	u64 ret;
+
+	if (BITS_PER_LONG < 64) {
+		u32 *p32 = (u32 *)p;
+		u32 h, l;
+
+		/*
+		 * Read high then low, and then make sure high is
+		 * still the same; this will only loop if low wraps
+		 * and carries into high.
+		 * XXX some clean way to make this endian-proof?
+		 */
+		do {
+			h = p32[1];
+			barrier();
+			l = p32[0];
+			barrier();
+		} while (p32[1] != h);
+
+		ret = (((u64)h) << 32) | l;
+	} else
+		ret = *p;
+
+	return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+	u64 state_time;
+	struct vcpu_runstate_info *state;
+
+	preempt_disable();
+
+	state = &__get_cpu_var(runstate);
+
+	/*
+	 * The runstate info is always updated by the hypervisor on
+	 * the current CPU, so there's no need to use anything
+	 * stronger than a compiler barrier when fetching it.
+	 */
+	do {
+		state_time = get64(&state->state_entry_time);
+		barrier();
+		*res = *state;
+		barrier();
+	} while (get64(&state->state_entry_time) != state_time);
+
+	preempt_enable();
+}
+
+static void setup_runstate_info(int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+
+	area.addr.v = &per_cpu(runstate, cpu);
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+			       cpu, &area))
+		BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+	struct vcpu_runstate_info state;
+	struct vcpu_runstate_info *snap;
+	s64 blocked, runnable, offline, stolen;
+	cputime_t ticks;
+
+	get_runstate_snapshot(&state);
+
+	WARN_ON(state.state != RUNSTATE_running);
+
+	snap = &__get_cpu_var(runstate_snapshot);
+
+	/* work out how much time the VCPU has not been runn*ing*  */
+	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+	*snap = state;
+
+	/* Add the appropriate number of ticks of stolen time,
+	   including any left-overs from last time.  Passing NULL to
+	   account_steal_time accounts the time as stolen. */
+	stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+	if (stolen < 0)
+		stolen = 0;
+
+	ticks = 0;
+	while (stolen >= NS_PER_TICK) {
+		ticks++;
+		stolen -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_stolen) = stolen;
+	account_steal_time(NULL, ticks);
+
+	/* Add the appropriate number of ticks of blocked time,
+	   including any left-overs from last time.  Passing idle to
+	   account_steal_time accounts the time as idle/wait. */
+	blocked += __get_cpu_var(residual_blocked);
+
+	if (blocked < 0)
+		blocked = 0;
+
+	ticks = 0;
+	while (blocked >= NS_PER_TICK) {
+		ticks++;
+		blocked -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_blocked) = blocked;
+	account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+
+
+/* Get the CPU speed from Xen */
 unsigned long xen_cpu_khz(void)
 {
 	u64 cpu_khz = 1000000ULL << 32;
@@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void)
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
  */
-static void get_time_values_from_xen(void)
+static unsigned get_time_values_from_xen(void)
 {
 	struct vcpu_time_info   *src;
 	struct shadow_time_info *dst;
 
-	preempt_disable();
-
 	/* src is shared memory with the hypervisor, so we need to
 	   make sure we get a consistent snapshot, even in the face of
 	   being preempted. */
@@ -79,7 +212,7 @@ static void get_time_values_from_xen(void)
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) | (dst->version ^ src->version));
 
-	preempt_enable();
+	return dst->version;
 }
 
 /*
@@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 static u64 get_nsec_offset(struct shadow_time_info *shadow)
 {
 	u64 now, delta;
-	rdtscll(now);
+	now = native_read_tsc();
 	delta = now - shadow->tsc_timestamp;
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
@@ -132,10 +265,14 @@ cycle_t xen_clocksource_read(void)
 {
 	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
 	cycle_t ret;
+	unsigned version;
 
-	get_time_values_from_xen();
-
-	ret = shadow->system_timestamp + get_nsec_offset(shadow);
+	do {
+		version = get_time_values_from_xen();
+		barrier();
+		ret = shadow->system_timestamp + get_nsec_offset(shadow);
+		barrier();
+	} while (version != __get_cpu_var(xen_vcpu)->time.version);
 
 	put_cpu_var(shadow_time);
 
@@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 		ret = IRQ_HANDLED;
 	}
 
+	do_stolen_accounting();
+
 	return ret;
 }
 
@@ -378,6 +517,8 @@ static void xen_setup_timer(int cpu)
 	evt->irq = irq;
 	clockevents_register_device(evt);
 
+	setup_runstate_info(cpu);
+
 	put_cpu_var(xen_clock_events);
 }
 
@@ -390,7 +531,7 @@ __init void xen_time_init(void)
 	clocksource_register(&xen_clocksource);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
-		/* Successfully turned off 100hz tick, so we have the
+		/* Successfully turned off 100Hz tick, so we have the
 		   vcpuop-based timer interface */
 		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 		xen_clockevent = &xen_vcpuop_clockevent;
-- 
cgit v1.2.3


From ab55028886dd1dd54585f22bf19a00eb23869340 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: Implement sched_clock

Implement xen_sched_clock, which returns the number of ns the current
vcpu has been actually in an unstolen state (ie, running or blocked,
vs runnable-but-not-running, or offline) since boot.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: john stultz <johnstul@us.ibm.com>
---
 arch/i386/xen/enlighten.c |  2 +-
 arch/i386/xen/time.c      | 27 ++++++++++++++++++++++++++-
 arch/i386/xen/xen-ops.h   |  3 +--
 3 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 9550ae3b1fb1..a9ba834295a2 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -683,7 +683,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.set_wallclock = xen_set_wallclock,
 	.get_wallclock = xen_get_wallclock,
 	.get_cpu_khz = xen_cpu_khz,
-	.sched_clock = xen_clocksource_read,
+	.sched_clock = xen_sched_clock,
 
 	.cpuid = xen_cpuid,
 
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index acbfd9969462..2aab44bec2a5 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -28,6 +28,8 @@
 #define TIMER_SLOP	100000
 #define NS_PER_TICK	(1000000000LL / HZ)
 
+static cycle_t xen_clocksource_read(void);
+
 /* These are perodically updated in shared_info, and then copied here. */
 struct shadow_time_info {
 	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
@@ -169,6 +171,29 @@ static void do_stolen_accounting(void)
 	account_steal_time(idle_task(smp_processor_id()), ticks);
 }
 
+/*
+ * Xen sched_clock implementation.  Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+unsigned long long xen_sched_clock(void)
+{
+	struct vcpu_runstate_info state;
+	cycle_t now = xen_clocksource_read();
+	s64 offset;
+
+	get_runstate_snapshot(&state);
+
+	WARN_ON(state.state != RUNSTATE_running);
+
+	offset = now - state.state_entry_time;
+	if (offset < 0)
+		offset = 0;
+
+	return state.time[RUNSTATE_blocked] +
+		state.time[RUNSTATE_running] +
+		offset;
+}
 
 
 /* Get the CPU speed from Xen */
@@ -261,7 +286,7 @@ static u64 get_nsec_offset(struct shadow_time_info *shadow)
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
 
-cycle_t xen_clocksource_read(void)
+static cycle_t xen_clocksource_read(void)
 {
 	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
 	cycle_t ret;
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 54d98b52085e..7667abd390ec 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -2,7 +2,6 @@
 #define XEN_OPS_H
 
 #include <linux/init.h>
-#include <linux/clocksource.h>
 
 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
@@ -18,7 +17,7 @@ unsigned long xen_cpu_khz(void);
 void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
-cycle_t xen_clocksource_read(void);
+unsigned long long xen_sched_clock(void);
 
 void xen_mark_init_mm_pinned(void);
 
-- 
cgit v1.2.3


From f87e4cac4f4e940b328d3deb5b53e642e3881f43 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: SMP guest support

This is a fairly straightforward Xen implementation of smp_ops.

Xen has its own IPI mechanisms, and has no dependency on any
APIC-based IPI.  The smp_ops hooks and the flush_tlb_others pv_op
allow a Xen guest to avoid all APIC code in arch/i386 (the only apic
operation is a single apic_read for the apic version number).

One subtle point which needs to be addressed is unpinning pagetables
when another cpu may have a lazy tlb reference to the pagetable. Xen
will not allow an in-use pagetable to be unpinned, so we must find any
other cpus with a reference to the pagetable and get them to shoot
down their references.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <ak@suse.de>
---
 arch/i386/xen/Kconfig     |   2 +-
 arch/i386/xen/Makefile    |   2 +
 arch/i386/xen/enlighten.c | 115 ++++++++++---
 arch/i386/xen/events.c    |  80 ++++++++-
 arch/i386/xen/mmu.c       |  69 ++++++--
 arch/i386/xen/mmu.h       |  13 ++
 arch/i386/xen/setup.c     |   5 +
 arch/i386/xen/smp.c       | 407 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/time.c      |  13 +-
 arch/i386/xen/xen-ops.h   |  25 +++
 include/xen/events.h      |  27 ++-
 11 files changed, 705 insertions(+), 53 deletions(-)
 create mode 100644 arch/i386/xen/smp.c

(limited to 'arch')

diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
index 7c5550058c15..b7697ff22361 100644
--- a/arch/i386/xen/Kconfig
+++ b/arch/i386/xen/Kconfig
@@ -4,7 +4,7 @@
 
 config XEN
 	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES)
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index bf51cabed0d2..fd05f243a3f8 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1,2 +1,4 @@
 obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
 			events.o time.o
+
+obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index a9ba834295a2..de62d66e0893 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/highmem.h>
+#include <linux/smp.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
@@ -40,6 +41,7 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -56,7 +58,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
-static void xen_vcpu_setup(int cpu)
+void xen_vcpu_setup(int cpu)
 {
 	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 }
@@ -347,23 +349,14 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 	}
 }
 
-/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
-   hold a spinlock to protect the static traps[] array (static because
-   it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+				  struct trap_info *traps)
 {
-	static DEFINE_SPINLOCK(lock);
-	static struct trap_info traps[257];
-
-	int cpu = smp_processor_id();
 	unsigned in, out, count;
 
-	per_cpu(idt_desc, cpu) = *desc;
-
 	count = (desc->size+1) / 8;
 	BUG_ON(count > 256);
 
-	spin_lock(&lock);
 	for (in = out = 0; in < count; in++) {
 		const u32 *entry = (u32 *)(desc->address + in * 8);
 
@@ -371,6 +364,31 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 			out++;
 	}
 	traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);
+
+	xen_convert_trap_info(desc, traps);
+
+	put_cpu_var(idt_desc);
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+	static DEFINE_SPINLOCK(lock);
+	static struct trap_info traps[257];
+	int cpu = smp_processor_id();
+
+	per_cpu(idt_desc, cpu) = *desc;
+
+	spin_lock(&lock);
+
+	xen_convert_trap_info(desc, traps);
 
 	xen_mc_flush();
 	if (HYPERVISOR_set_trap_table(traps))
@@ -428,6 +446,12 @@ static unsigned long xen_apic_read(unsigned long reg)
 {
 	return 0;
 }
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
 #endif
 
 static void xen_flush_tlb(void)
@@ -449,6 +473,40 @@ static void xen_flush_tlb_single(unsigned long addr)
 		BUG();
 }
 
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+				 unsigned long va)
+{
+	struct mmuext_op op;
+	cpumask_t cpumask = *cpus;
+
+	/*
+	 * A couple of (to be removed) sanity checks:
+	 *
+	 * - current CPU must not be in mask
+	 * - mask must exist :)
+	 */
+	BUG_ON(cpus_empty(cpumask));
+	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+	BUG_ON(!mm);
+
+	/* If a CPU which we ran on has gone down, OK. */
+	cpus_and(cpumask, cpumask, cpu_online_map);
+	if (cpus_empty(cpumask))
+		return;
+
+	if (va == TLB_FLUSH_ALL) {
+		op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+		op.arg2.vcpumask = (void *)cpus;
+	} else {
+		op.cmd = MMUEXT_INVLPG_MULTI;
+		op.arg1.linear_addr = va;
+		op.arg2.vcpumask = (void *)cpus;
+	}
+
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 static unsigned long xen_read_cr2(void)
 {
 	return x86_read_percpu(xen_vcpu)->arch.cr2;
@@ -460,18 +518,6 @@ static void xen_write_cr4(unsigned long cr4)
 	native_write_cr4(cr4 & ~X86_CR4_TSD);
 }
 
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
 static unsigned long xen_read_cr3(void)
 {
 	return x86_read_percpu(xen_cr3);
@@ -740,8 +786,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.io_delay = xen_io_delay,
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = paravirt_nop,
-	.apic_write_atomic = paravirt_nop,
+	.apic_write = xen_apic_write,
+	.apic_write_atomic = xen_apic_write,
 	.apic_read = xen_apic_read,
 	.setup_boot_clock = paravirt_nop,
 	.setup_secondary_clock = paravirt_nop,
@@ -751,6 +797,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
 	.flush_tlb_single = xen_flush_tlb_single,
+	.flush_tlb_others = xen_flush_tlb_others,
 
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
@@ -796,6 +843,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.set_lazy_mode = xen_set_lazy_mode,
 };
 
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_smp_prepare_cpus,
+	.cpu_up = xen_cpu_up,
+	.smp_cpus_done = xen_smp_cpus_done,
+
+	.smp_send_stop = xen_smp_send_stop,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+	.smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif	/* CONFIG_SMP */
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -808,6 +868,9 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	paravirt_ops = xen_paravirt_ops;
+#ifdef CONFIG_SMP
+	smp_ops = xen_smp_ops;
+#endif
 
 	xen_setup_features();
 
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
index e7c5d00ab4fe..4103b8bf22fd 100644
--- a/arch/i386/xen/events.c
+++ b/arch/i386/xen/events.c
@@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock);
 /* IRQ <-> VIRQ mapping. */
 static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
 /* Packed IRQ information: binding type, sub-type index, and event channel. */
 struct packed_irq
 {
@@ -58,7 +61,13 @@ struct packed_irq
 static struct packed_irq irq_info[NR_IRQS];
 
 /* Binding types. */
-enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+enum {
+	IRQT_UNBOUND,
+	IRQT_PIRQ,
+	IRQT_VIRQ,
+	IRQT_IPI,
+	IRQT_EVTCHN
+};
 
 /* Convenient shorthand for packed representation of an unbound IRQ. */
 #define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
@@ -261,6 +270,45 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
 	return irq;
 }
 
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+	struct evtchn_bind_ipi bind_ipi;
+	int evtchn, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(ipi_to_irq, cpu)[ipi];
+	if (irq == -1) {
+		irq = find_unbound_irq();
+		if (irq < 0)
+			goto out;
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+					      handle_level_irq, "ipi");
+
+		bind_ipi.vcpu = cpu;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+						&bind_ipi) != 0)
+			BUG();
+		evtchn = bind_ipi.port;
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+		per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+
+	irq_bindcount[irq]++;
+
+ out:
+	spin_unlock(&irq_mapping_update_lock);
+	return irq;
+}
+
+
 static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 {
 	struct evtchn_bind_virq bind_virq;
@@ -369,6 +417,28 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
 
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+			   unsigned int cpu,
+			   irq_handler_t handler,
+			   unsigned long irqflags,
+			   const char *devname,
+			   void *dev_id)
+{
+	int irq, retval;
+
+	irq = bind_ipi_to_irq(ipi, cpu);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+
 void unbind_from_irqhandler(unsigned int irq, void *dev_id)
 {
 	free_irq(irq, dev_id);
@@ -376,6 +446,14 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id)
 }
 EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
 
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+	int irq = per_cpu(ipi_to_irq, cpu)[vector];
+	BUG_ON(irq < 0);
+	notify_remote_via_irq(irq);
+}
+
+
 /*
  * Search the CPUs pending events bitmasks.  For each one found, map
  * the event number to an irq, and feed it into do_IRQ() for
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index 53501ce2d15c..bc49ef846203 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -391,8 +391,12 @@ void xen_pgd_pin(pgd_t *pgd)
 
 	xen_mc_batch();
 
-	if (pgd_walk(pgd, pin_page, TASK_SIZE))
+	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+		/* re-enable interrupts for kmap_flush_unused */
+		xen_mc_issue(0);
 		kmap_flush_unused();
+		xen_mc_batch();
+	}
 
 	mcs = __xen_mc_entry(sizeof(*op));
 	op = mcs.args;
@@ -474,27 +478,58 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
-void xen_exit_mmap(struct mm_struct *mm)
-{
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
 
-	/*
-	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-	 */
-	if (tsk->active_mm == mm) {
-		tsk->active_mm = &init_mm;
-		atomic_inc(&init_mm.mm_count);
+#ifdef CONFIG_SMP
+/* Another cpu may still have their %cr3 pointing at the pagetable, so
+   we need to repoint it somewhere else before we can unpin it. */
+static void drop_other_mm_ref(void *info)
+{
+	struct mm_struct *mm = info;
 
-		switch_mm(mm, &init_mm, tsk);
+	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+		leave_mm(smp_processor_id());
+}
 
-		atomic_dec(&mm->mm_count);
-		BUG_ON(atomic_read(&mm->mm_count) == 0);
+static void drop_mm_ref(struct mm_struct *mm)
+{
+	if (current->active_mm == mm) {
+		if (current->mm == mm)
+			load_cr3(swapper_pg_dir);
+		else
+			leave_mm(smp_processor_id());
 	}
 
-	task_unlock(tsk);
+	if (!cpus_empty(mm->cpu_vm_mask))
+		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+					   mm, 1);
+}
+#else
+static void drop_mm_ref(struct mm_struct *mm)
+{
+	if (current->active_mm == mm)
+		load_cr3(swapper_pg_dir);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it.  This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to use init_mm,
+ * unpin old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may be still using the
+ * pagetable because of lazy tlb flushing.  This means we need need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+	get_cpu();		/* make sure we don't move around */
+	drop_mm_ref(mm);
+	put_cpu();
 
 	xen_pgd_unpin(mm->pgd);
 }
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
index 49776fe9f02a..c9ff27f3ac3a 100644
--- a/arch/i386/xen/mmu.h
+++ b/arch/i386/xen/mmu.h
@@ -3,6 +3,19 @@
 #include <linux/linkage.h>
 #include <asm/page.h>
 
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 void xen_set_pte(pte_t *ptep, pte_t pteval);
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
index 7da93ee612f6..18a994d5a4c5 100644
--- a/arch/i386/xen/setup.c
+++ b/arch/i386/xen/setup.c
@@ -94,4 +94,9 @@ void __init xen_arch_setup(void)
 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 
 	pm_idle = xen_idle;
+
+#ifdef CONFIG_SMP
+	/* fill cpus_possible with all available cpus */
+	xen_fill_possible_map();
+#endif
 }
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 000000000000..a91587fbf5c2
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,407 @@
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops.  SMP under Xen is
+ * very straightforward.  Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of.  As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ *
+ * This does not handle HOTPLUG_CPU yet.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static cpumask_t cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+	void (*func) (void *info);
+	void *info;
+	atomic_t started;
+	atomic_t finished;
+	int wait;
+};
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+
+static struct call_data_struct *call_data;
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+	return IRQ_HANDLED;
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+	int cpu = smp_processor_id();
+
+	cpu_init();
+
+	preempt_disable();
+	per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+	xen_setup_cpu_clockevents();
+
+	/* We can take interrupts now: we're officially "up". */
+	local_irq_enable();
+
+	wmb();			/* make sure everything is out */
+	cpu_idle();
+}
+
+static int xen_smp_intr_init(unsigned int cpu)
+{
+	int rc;
+	const char *resched_name, *callfunc_name;
+
+	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+				    cpu,
+				    xen_reschedule_interrupt,
+				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				    resched_name,
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(resched_irq, cpu) = rc;
+
+	callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+	rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+				    cpu,
+				    xen_call_function_interrupt,
+				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+				    callfunc_name,
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(callfunc_irq, cpu) = rc;
+
+	return 0;
+
+ fail:
+	if (per_cpu(resched_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+	if (per_cpu(callfunc_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	return rc;
+}
+
+void __init xen_fill_possible_map(void)
+{
+	int i, rc;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+		if (rc >= 0)
+			cpu_set(i, cpu_possible_map);
+	}
+}
+
+void __init xen_smp_prepare_boot_cpu(void)
+{
+	int cpu;
+
+	BUG_ON(smp_processor_id() != 0);
+	native_smp_prepare_boot_cpu();
+
+	xen_vcpu_setup(0);
+
+	/* We've switched to the "real" per-cpu gdt, so make sure the
+	   old memory can be recycled */
+	make_lowmem_page_readwrite(&per_cpu__gdt_page);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		cpus_clear(cpu_sibling_map[cpu]);
+		cpus_clear(cpu_core_map[cpu]);
+	}
+}
+
+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+{
+	unsigned cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		cpus_clear(cpu_sibling_map[cpu]);
+		cpus_clear(cpu_core_map[cpu]);
+	}
+
+	smp_store_cpu_info(0);
+	set_cpu_sibling_map(0);
+
+	if (xen_smp_intr_init(0))
+		BUG();
+
+	cpu_initialized_map = cpumask_of_cpu(0);
+
+	/* Restrict the possible_map according to max_cpus. */
+	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+		for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+			continue;
+		cpu_clear(cpu, cpu_possible_map);
+	}
+
+	for_each_possible_cpu (cpu) {
+		struct task_struct *idle;
+
+		if (cpu == 0)
+			continue;
+
+		idle = fork_idle(cpu);
+		if (IS_ERR(idle))
+			panic("failed fork for CPU %d", cpu);
+
+		cpu_set(cpu, cpu_present_map);
+	}
+
+	//init_xenbus_allowed_cpumask();
+}
+
+static __cpuinit int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+	struct vcpu_guest_context *ctxt;
+	struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+
+	if (cpu_test_and_set(cpu, cpu_initialized_map))
+		return 0;
+
+	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+	if (ctxt == NULL)
+		return -ENOMEM;
+
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.ds = __USER_DS;
+	ctxt->user_regs.es = __USER_DS;
+	ctxt->user_regs.fs = __KERNEL_PERCPU;
+	ctxt->user_regs.gs = 0;
+	ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+
+	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+	xen_copy_trap_info(ctxt->trap_ctxt);
+
+	ctxt->ldt_ents = 0;
+
+	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
+	make_lowmem_page_readonly(gdt->gdt);
+
+	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
+	ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.esp0;
+
+	ctxt->event_callback_cs     = __KERNEL_CS;
+	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
+	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+
+	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+		BUG();
+
+	kfree(ctxt);
+	return 0;
+}
+
+int __cpuinit xen_cpu_up(unsigned int cpu)
+{
+	struct task_struct *idle = idle_task(cpu);
+	int rc;
+
+#if 0
+	rc = cpu_up_check(cpu);
+	if (rc)
+		return rc;
+#endif
+
+	init_gdt(cpu);
+	per_cpu(current_task, cpu) = idle;
+	xen_vcpu_setup(cpu);
+	irq_ctx_init(cpu);
+	xen_setup_timer(cpu);
+
+	/* make sure interrupts start blocked */
+	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+	rc = cpu_initialize_context(cpu, idle);
+	if (rc)
+		return rc;
+
+	if (num_online_cpus() == 1)
+		alternatives_smp_switch(1);
+
+	rc = xen_smp_intr_init(cpu);
+	if (rc)
+		return rc;
+
+	smp_store_cpu_info(cpu);
+	set_cpu_sibling_map(cpu);
+	/* This must be done before setting cpu_online_map */
+	wmb();
+
+	cpu_set(cpu, cpu_online_map);
+
+	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+	BUG_ON(rc);
+
+	return 0;
+}
+
+void xen_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+static void stop_self(void *v)
+{
+	int cpu = smp_processor_id();
+
+	/* make sure we're not pinning something down */
+	load_cr3(swapper_pg_dir);
+	/* should set up a minimal gdt */
+
+	HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+	BUG();
+}
+
+void xen_smp_send_stop(void)
+{
+	cpumask_t mask = cpu_online_map;
+	cpu_clear(smp_processor_id(), mask);
+	xen_smp_call_function_mask(mask, stop_self, NULL, 0);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+
+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+{
+	unsigned cpu;
+
+	cpus_and(mask, mask, cpu_online_map);
+
+	for_each_cpu_mask(cpu, mask)
+		xen_send_IPI_one(cpu, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+	void (*func) (void *info) = call_data->func;
+	void *info = call_data->info;
+	int wait = call_data->wait;
+
+	/*
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
+	 */
+	mb();
+	atomic_inc(&call_data->started);
+	/*
+	 * At this point the info structure may be out of scope unless wait==1
+	 */
+	irq_enter();
+	(*func)(info);
+	irq_exit();
+
+	if (wait) {
+		mb();		/* commit everything before setting finished */
+		atomic_inc(&call_data->finished);
+	}
+
+	return IRQ_HANDLED;
+}
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+			       void *info, int wait)
+{
+	struct call_data_struct data;
+	int cpus;
+
+	/* Holding any lock stops cpus from going down. */
+	spin_lock(&call_lock);
+
+	cpu_clear(smp_processor_id(), mask);
+
+	cpus = cpus_weight(mask);
+	if (!cpus) {
+		spin_unlock(&call_lock);
+		return 0;
+	}
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	mb();			/* write everything before IPI */
+
+	/* Send a message to other CPUs and wait for them to respond */
+	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+	/* Make sure other vcpus get a chance to run.
+	   XXX too severe?  Maybe we should check the other CPU's states? */
+	HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus ||
+	       (wait && atomic_read(&data.finished) != cpus))
+		cpu_relax();
+
+	spin_unlock(&call_lock);
+
+	return 0;
+}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index 2aab44bec2a5..aeb04cf5dbf1 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -519,7 +519,7 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 	return ret;
 }
 
-static void xen_setup_timer(int cpu)
+void xen_setup_timer(int cpu)
 {
 	const char *name;
 	struct clock_event_device *evt;
@@ -535,16 +535,20 @@ static void xen_setup_timer(int cpu)
 				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
 				      name, NULL);
 
-	evt = &get_cpu_var(xen_clock_events);
+	evt = &per_cpu(xen_clock_events, cpu);
 	memcpy(evt, xen_clockevent, sizeof(*evt));
 
 	evt->cpumask = cpumask_of_cpu(cpu);
 	evt->irq = irq;
-	clockevents_register_device(evt);
 
 	setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+	BUG_ON(preemptible());
 
-	put_cpu_var(xen_clock_events);
+	clockevents_register_device(&__get_cpu_var(xen_clock_events));
 }
 
 __init void xen_time_init(void)
@@ -570,4 +574,5 @@ __init void xen_time_init(void)
 	tsc_disable = 0;
 
 	xen_setup_timer(cpu);
+	xen_setup_cpu_clockevents();
 }
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 7667abd390ec..4069be8ba31f 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -3,6 +3,12 @@
 
 #include <linux/init.h>
 
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_copy_trap_info(struct trap_info *traps);
+
 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 
@@ -13,6 +19,8 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
 unsigned long xen_cpu_khz(void);
 void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
@@ -28,5 +36,22 @@ static inline unsigned xen_get_lazy_mode(void)
 	return x86_read_percpu(xen_lazy_mode);
 }
 
+void __init xen_fill_possible_map(void);
+
+void xen_vcpu_setup(int cpu);
+void xen_smp_prepare_boot_cpu(void);
+void xen_smp_prepare_cpus(unsigned int max_cpus);
+int xen_cpu_up(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			   int wait);
+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+				 int nonatomic, int wait);
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+			       void *info, int wait);
 
 #endif /* XEN_OPS_H */
diff --git a/include/xen/events.h b/include/xen/events.h
index 77f71c906823..7abe4ddfac54 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -1,15 +1,32 @@
 #ifndef _XEN_EVENTS_H
 #define _XEN_EVENTS_H
 
-#include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include <xen/interface/event_channel.h>
+#include <asm/xen/hypercall.h>
+
+enum ipi_vector {
+	XEN_RESCHEDULE_VECTOR,
+	XEN_CALL_FUNCTION_VECTOR,
+
+	XEN_NR_IPIS,
+};
 
 int bind_evtchn_to_irqhandler(unsigned int evtchn,
-			      irqreturn_t (*handler)(int, void *),
+			      irq_handler_t handler,
 			      unsigned long irqflags, const char *devname,
 			      void *dev_id);
 int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
-			    irqreturn_t (*handler)(int, void *),
-			    unsigned long irqflags, const char *devname, void *dev_id);
+			    irq_handler_t handler,
+			    unsigned long irqflags, const char *devname,
+			    void *dev_id);
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+			   unsigned int cpu,
+			   irq_handler_t handler,
+			   unsigned long irqflags,
+			   const char *devname,
+			   void *dev_id);
 
 /*
  * Common unbind function for all event sources. Takes IRQ to unbind from.
@@ -18,6 +35,8 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
  */
 void unbind_from_irqhandler(unsigned int irq, void *dev_id);
 
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
+
 static inline void notify_remote_via_evtchn(int port)
 {
 	struct evtchn_send send = { .port = port };
-- 
cgit v1.2.3


From f120f13ea0dbb0b0d6675683d5f6faea71277e65 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: Add support for preemption

Add Xen support for preemption.  This is mostly a cleanup of existing
preempt_enable/disable calls, or just comments to explain the current
usage.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/Kconfig      |  2 +-
 arch/i386/xen/enlighten.c  | 80 ++++++++++++++++++++++++++++------------------
 arch/i386/xen/mmu.c        |  3 ++
 arch/i386/xen/multicalls.c | 11 ++++---
 arch/i386/xen/time.c       | 22 ++++++++++---
 5 files changed, 76 insertions(+), 42 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
index b7697ff22361..9df99e1885a4 100644
--- a/arch/i386/xen/Kconfig
+++ b/arch/i386/xen/Kconfig
@@ -4,7 +4,7 @@
 
 config XEN
 	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index de62d66e0893..a1124b7f1d14 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/preempt.h>
+#include <linux/hardirq.h>
 #include <linux/percpu.h>
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
@@ -108,11 +109,10 @@ static unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
-	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
+
 	/* flag has opposite sense of mask */
 	flags = !vcpu->evtchn_upcall_mask;
-	preempt_enable();
 
 	/* convert to IF type flag
 	   -0 -> 0x00000000
@@ -125,32 +125,35 @@ static void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
 
-	preempt_disable();
-
 	/* convert from IF type flag */
 	flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = flags;
+	preempt_enable_no_resched();
 
-	if (flags == 0) {
-		/* Unmask then check (avoid races).  We're only protecting
-		   against updates by this CPU, so there's no need for
-		   anything stronger. */
-		barrier();
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
 
+	if (flags == 0) {
+		preempt_check_resched();
+		barrier(); /* unmask then check (avoid races) */
 		if (unlikely(vcpu->evtchn_upcall_pending))
 			force_evtchn_callback();
-		preempt_enable();
-	} else
-		preempt_enable_no_resched();
+	}
 }
 
 static void xen_irq_disable(void)
 {
-	struct vcpu_info *vcpu;
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
 	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
-	vcpu->evtchn_upcall_mask = 1;
+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 
@@ -158,18 +161,20 @@ static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
 
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
 	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
+	preempt_enable_no_resched();
 
-	/* Unmask then check (avoid races).  We're only protecting
-	   against updates by this CPU, so there's no need for
-	   anything stronger. */
-	barrier();
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
 
+	barrier(); /* unmask then check (avoid races) */
 	if (unlikely(vcpu->evtchn_upcall_pending))
 		force_evtchn_callback();
-	preempt_enable();
 }
 
 static void xen_safe_halt(void)
@@ -189,6 +194,8 @@ static void xen_halt(void)
 
 static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
 {
+	BUG_ON(preemptible());
+
 	switch (mode) {
 	case PARAVIRT_LAZY_NONE:
 		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
@@ -293,9 +300,13 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 	xmaddr_t mach_lp = virt_to_machine(lp);
 	u64 entry = (u64)high << 32 | low;
 
+	preempt_disable();
+
 	xen_mc_flush();
 	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
 		BUG();
+
+	preempt_enable();
 }
 
 static int cvt_gate_to_trap(int vector, u32 low, u32 high,
@@ -328,11 +339,13 @@ static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
 static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 				u32 low, u32 high)
 {
-
-	int cpu = smp_processor_id();
 	unsigned long p = (unsigned long)&dt[entrynum];
-	unsigned long start = per_cpu(idt_desc, cpu).address;
-	unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+	unsigned long start, end;
+
+	preempt_disable();
+
+	start = __get_cpu_var(idt_desc).address;
+	end = start + __get_cpu_var(idt_desc).size + 1;
 
 	xen_mc_flush();
 
@@ -347,6 +360,8 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 			if (HYPERVISOR_set_trap_table(info))
 				BUG();
 	}
+
+	preempt_enable();
 }
 
 static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
@@ -368,11 +383,9 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
 
 void xen_copy_trap_info(struct trap_info *traps)
 {
-	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);
+	const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
 
 	xen_convert_trap_info(desc, traps);
-
-	put_cpu_var(idt_desc);
 }
 
 /* Load a new IDT into Xen.  In principle this can be per-CPU, so we
@@ -382,12 +395,11 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 {
 	static DEFINE_SPINLOCK(lock);
 	static struct trap_info traps[257];
-	int cpu = smp_processor_id();
-
-	per_cpu(idt_desc, cpu) = *desc;
 
 	spin_lock(&lock);
 
+	__get_cpu_var(idt_desc) = *desc;
+
 	xen_convert_trap_info(desc, traps);
 
 	xen_mc_flush();
@@ -402,6 +414,8 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 				u32 low, u32 high)
 {
+	preempt_disable();
+
 	switch ((high >> 8) & 0xff) {
 	case DESCTYPE_LDT:
 	case DESCTYPE_TSS:
@@ -418,10 +432,12 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 	}
 
 	}
+
+	preempt_enable();
 }
 
 static void xen_load_esp0(struct tss_struct *tss,
-				   struct thread_struct *thread)
+			  struct thread_struct *thread)
 {
 	struct multicall_space mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
@@ -525,6 +541,8 @@ static unsigned long xen_read_cr3(void)
 
 static void xen_write_cr3(unsigned long cr3)
 {
+	BUG_ON(preemptible());
+
 	if (cr3 == x86_read_percpu(xen_cr3)) {
 		/* just a simple tlb flush */
 		xen_flush_tlb();
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index bc49ef846203..f431cf14e644 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -38,6 +38,7 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
+#include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
 #include <linux/sched.h>
@@ -531,5 +532,7 @@ void xen_exit_mmap(struct mm_struct *mm)
 	drop_mm_ref(mm);
 	put_cpu();
 
+	spin_lock(&mm->page_table_lock);
 	xen_pgd_unpin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
index 869f9833f08f..d4015a9ed46c 100644
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -20,6 +20,7 @@
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/xen/hypercall.h>
 
@@ -39,10 +40,12 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
 void xen_mc_flush(void)
 {
-	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
 
+	BUG_ON(preemptible());
+
 	/* Disable interrupts in case someone comes in and queues
 	   something in the middle */
 	local_irq_save(flags);
@@ -60,7 +63,6 @@ void xen_mc_flush(void)
 	} else
 		BUG_ON(b->argidx != 0);
 
-	put_cpu_var(mc_buffer);
 	local_irq_restore(flags);
 
 	BUG_ON(ret);
@@ -68,10 +70,11 @@ void xen_mc_flush(void)
 
 struct multicall_space __xen_mc_entry(size_t args)
 {
-	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	struct multicall_space ret;
 	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
 
+	BUG_ON(preemptible());
 	BUG_ON(argspace > MC_ARGS);
 
 	if (b->mcidx == MC_BATCH ||
@@ -83,7 +86,5 @@ struct multicall_space __xen_mc_entry(size_t args)
 	ret.args = &b->args[b->argidx];
 	b->argidx += argspace;
 
-	put_cpu_var(mc_buffer);
-
 	return ret;
 }
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index aeb04cf5dbf1..51fdabf1fd4d 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -88,7 +88,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 	u64 state_time;
 	struct vcpu_runstate_info *state;
 
-	preempt_disable();
+	BUG_ON(preemptible());
 
 	state = &__get_cpu_var(runstate);
 
@@ -103,8 +103,6 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 		*res = *state;
 		barrier();
 	} while (get64(&state->state_entry_time) != state_time);
-
-	preempt_enable();
 }
 
 static void setup_runstate_info(int cpu)
@@ -179,9 +177,19 @@ static void do_stolen_accounting(void)
 unsigned long long xen_sched_clock(void)
 {
 	struct vcpu_runstate_info state;
-	cycle_t now = xen_clocksource_read();
+	cycle_t now;
+	u64 ret;
 	s64 offset;
 
+	/*
+	 * Ideally sched_clock should be called on a per-cpu basis
+	 * anyway, so preempt should already be disabled, but that's
+	 * not current practice at the moment.
+	 */
+	preempt_disable();
+
+	now = xen_clocksource_read();
+
 	get_runstate_snapshot(&state);
 
 	WARN_ON(state.state != RUNSTATE_running);
@@ -190,9 +198,13 @@ unsigned long long xen_sched_clock(void)
 	if (offset < 0)
 		offset = 0;
 
-	return state.time[RUNSTATE_blocked] +
+	ret = state.time[RUNSTATE_blocked] +
 		state.time[RUNSTATE_running] +
 		offset;
+
+	preempt_enable();
+
+	return ret;
 }
 
 
-- 
cgit v1.2.3


From d66bf8fcf3fce058a1cd164a7c8ee6093fdf039c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: lazy-mmu operations

This patch uses the lazy-mmu hooks to batch mmu operations where
possible.  This is primarily useful for batching operations applied to
active pagetables, which happens during mprotect, munmap, mremap and
the like (mmap does not do bulk pagetable operations, so it isn't
helped).

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/enlighten.c  | 48 +++++++++++++++++++++++++++---------------
 arch/i386/xen/mmu.c        | 52 ++++++++++++++++++++++++++++++++++------------
 arch/i386/xen/multicalls.c |  4 ++--
 3 files changed, 72 insertions(+), 32 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index a1124b7f1d14..031dc1dcf819 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -472,28 +472,38 @@ static void xen_apic_write(unsigned long reg, unsigned long val)
 
 static void xen_flush_tlb(void)
 {
-	struct mmuext_op op;
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
-	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
 static void xen_flush_tlb_single(unsigned long addr)
 {
-	struct mmuext_op op;
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
-	op.cmd = MMUEXT_INVLPG_LOCAL;
-	op.arg1.linear_addr = addr & PAGE_MASK;
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	op = mcs.args;
+	op->cmd = MMUEXT_INVLPG_LOCAL;
+	op->arg1.linear_addr = addr & PAGE_MASK;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
 static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 				 unsigned long va)
 {
-	struct mmuext_op op;
+	struct {
+		struct mmuext_op op;
+		cpumask_t mask;
+	} *args;
 	cpumask_t cpumask = *cpus;
+	struct multicall_space mcs;
 
 	/*
 	 * A couple of (to be removed) sanity checks:
@@ -510,17 +520,21 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 	if (cpus_empty(cpumask))
 		return;
 
+	mcs = xen_mc_entry(sizeof(*args));
+	args = mcs.args;
+	args->mask = cpumask;
+	args->op.arg2.vcpumask = &args->mask;
+
 	if (va == TLB_FLUSH_ALL) {
-		op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-		op.arg2.vcpumask = (void *)cpus;
+		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
 	} else {
-		op.cmd = MMUEXT_INVLPG_MULTI;
-		op.arg1.linear_addr = va;
-		op.arg2.vcpumask = (void *)cpus;
+		args->op.cmd = MMUEXT_INVLPG_MULTI;
+		args->op.arg1.linear_addr = va;
 	}
 
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
 static unsigned long xen_read_cr2(void)
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index f431cf14e644..4ae038aa6c24 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -98,12 +98,20 @@ void make_lowmem_page_readwrite(void *vaddr)
 
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
-	struct mmu_update u;
+	struct multicall_space mcs;
+	struct mmu_update *u;
 
-	u.ptr = virt_to_machine(ptr).maddr;
-	u.val = pmd_val_ma(val);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*u));
+	u = mcs.args;
+	u->ptr = virt_to_machine(ptr).maddr;
+	u->val = pmd_val_ma(val);
+	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
 }
 
 /*
@@ -146,20 +154,38 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	if ((mm != current->mm && mm != &init_mm) ||
-	    HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
-		xen_set_pte(ptep, pteval);
+	if (mm == current->mm || mm == &init_mm) {
+		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+			struct multicall_space mcs;
+			mcs = xen_mc_entry(0);
+
+			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+			xen_mc_issue(PARAVIRT_LAZY_MMU);
+			return;
+		} else
+			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+				return;
+	}
+	xen_set_pte(ptep, pteval);
 }
 
 #ifdef CONFIG_X86_PAE
 void xen_set_pud(pud_t *ptr, pud_t val)
 {
-	struct mmu_update u;
+	struct multicall_space mcs;
+	struct mmu_update *u;
 
-	u.ptr = virt_to_machine(ptr).maddr;
-	u.val = pud_val_ma(val);
-	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
-		BUG();
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*u));
+	u = mcs.args;
+	u->ptr = virt_to_machine(ptr).maddr;
+	u->val = pud_val_ma(val);
+	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
 }
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
index d4015a9ed46c..c837e8e463db 100644
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -26,8 +26,8 @@
 
 #include "multicalls.h"
 
-#define MC_BATCH	8
-#define MC_ARGS		(MC_BATCH * 32 / sizeof(u64))
+#define MC_BATCH	32
+#define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))
 
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
-- 
cgit v1.2.3


From 8b84ad942b534f8faeb34b68f0f7277ea375fed0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: hack to prevent bad segment register reload

The hypervisor saves and restores the segment registers as part of the
state is saves while context switching.  If, during a context switch,
the next process doesn't use the TLS segments, it invalidates the GDT
entry, causing the segment register reload to fault.  This fault
effectively doubles the cost of a context switch.

This patch is a band-aid workaround which clears the usermode %gs
after it has been saved for the previous process, but before it gets
reloaded for the next, and it avoids having the hypervisor attempt to
erroneously reload it.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/enlighten.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 031dc1dcf819..42756771b8eb 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -291,6 +291,18 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	load_TLS_descriptor(t, cpu, 2);
 
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+	/*
+	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+	 * it means we're in a context switch, and %gs has just been
+	 * saved.  This means we can zero it out to prevent faults on
+	 * exit from the hypervisor if the next process has no %gs.
+	 * Either way, it has been saved, and the new value will get
+	 * loaded properly.  This will go away as soon as Xen has been
+	 * modified to not save/restore %gs for normal hypercalls.
+	 */
+	if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+		loadsegment(gs, 0);
 }
 
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
-- 
cgit v1.2.3


From b536b4b9623084d86f2b1f19cb44a2d6d74f00bf Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: use the hvc console infrastructure for Xen console

Implement a Xen back-end for hvc console.

* * *
Add early printk support via hvc console, enable using
"earlyprintk=xen" on the kernel command line.

From: Gerd Hoffmann <kraxel@suse.de>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Olof Johansson <olof@lixom.net>
---
 arch/i386/xen/events.c            |   3 +-
 arch/x86_64/kernel/early_printk.c |   5 ++
 drivers/char/Kconfig              |   8 ++
 drivers/char/Makefile             |   1 +
 drivers/char/hvc_xen.c            | 159 ++++++++++++++++++++++++++++++++++++++
 include/xen/events.h              |   1 +
 include/xen/hvc-console.h         |   6 ++
 7 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 drivers/char/hvc_xen.c
 create mode 100644 include/xen/hvc-console.h

(limited to 'arch')

diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
index 4103b8bf22fd..8904acc20f8c 100644
--- a/arch/i386/xen/events.c
+++ b/arch/i386/xen/events.c
@@ -244,7 +244,7 @@ static int find_unbound_irq(void)
 	return irq;
 }
 
-static int bind_evtchn_to_irq(unsigned int evtchn)
+int bind_evtchn_to_irq(unsigned int evtchn)
 {
 	int irq;
 
@@ -269,6 +269,7 @@ static int bind_evtchn_to_irq(unsigned int evtchn)
 
 	return irq;
 }
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
 
 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 296d2b0c5d88..fd9aff3f3890 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/fcntl.h>
+#include <xen/hvc-console.h>
 
 /* Simple VGA output */
 
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
  		simnow_init(buf + 6);
  		early_console = &simnow_console;
  		keep_early = 1;
+#ifdef CONFIG_HVC_XEN
+	} else if (!strncmp(buf, "xen", 3)) {
+		early_console = &xenboot_console;
+#endif
 	}
 
 	if (keep_early)
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 97bd71bc3aea..9e8f21410d2d 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -604,6 +604,14 @@ config HVC_BEAT
 	help
 	  Toshiba's Cell Reference Set Beat Console device driver
 
+config HVC_XEN
+	bool "Xen Hypervisor Console support"
+	depends on XEN
+	select HVC_DRIVER
+	default y
+	help
+	  Xen virtual console device driver
+
 config HVCS
 	tristate "IBM Hypervisor Virtual Console Server support"
 	depends on PPC_PSERIES
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index f2996a95eb07..8852b8d643cf 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES)	+= hvc_iseries.o
 obj-$(CONFIG_HVC_RTAS)		+= hvc_rtas.o
 obj-$(CONFIG_HVC_BEAT)		+= hvc_beat.o
 obj-$(CONFIG_HVC_DRIVER)	+= hvc_console.o
+obj-$(CONFIG_HVC_XEN)		+= hvc_xen.o
 obj-$(CONFIG_RAW_DRIVER)	+= raw.o
 obj-$(CONFIG_SGI_SNSC)		+= snsc.o snsc_event.o
 obj-$(CONFIG_MSPEC)		+= mspec.o
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
new file mode 100644
index 000000000000..dd68f8541c2d
--- /dev/null
+++ b/drivers/char/hvc_xen.c
@@ -0,0 +1,159 @@
+/*
+ * xen console driver interface to hvc_console.c
+ *
+ * (c) 2007 Gerd Hoffmann <kraxel@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/types.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/page.h>
+#include <xen/events.h>
+#include <xen/interface/io/console.h>
+#include <xen/hvc-console.h>
+
+#include "hvc_console.h"
+
+#define HVC_COOKIE   0x58656e /* "Xen" in hex */
+
+static struct hvc_struct *hvc;
+static int xencons_irq;
+
+/* ------------------------------------------------------------------ */
+
+static inline struct xencons_interface *xencons_interface(void)
+{
+	return mfn_to_virt(xen_start_info->console.domU.mfn);
+}
+
+static inline void notify_daemon(void)
+{
+	/* Use evtchn: this is called early, before irq is set up. */
+	notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
+}
+
+static int write_console(uint32_t vtermno, const char *data, int len)
+{
+	struct xencons_interface *intf = xencons_interface();
+	XENCONS_RING_IDX cons, prod;
+	int sent = 0;
+
+	cons = intf->out_cons;
+	prod = intf->out_prod;
+	mb();			/* update queue values before going on */
+	BUG_ON((prod - cons) > sizeof(intf->out));
+
+	while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
+		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
+
+	wmb();			/* write ring before updating pointer */
+	intf->out_prod = prod;
+
+	notify_daemon();
+	return sent;
+}
+
+static int read_console(uint32_t vtermno, char *buf, int len)
+{
+	struct xencons_interface *intf = xencons_interface();
+	XENCONS_RING_IDX cons, prod;
+	int recv = 0;
+
+	cons = intf->in_cons;
+	prod = intf->in_prod;
+	mb();			/* get pointers before reading ring */
+	BUG_ON((prod - cons) > sizeof(intf->in));
+
+	while (cons != prod && recv < len)
+		buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
+
+	mb();			/* read ring before consuming */
+	intf->in_cons = cons;
+
+	notify_daemon();
+	return recv;
+}
+
+static struct hv_ops hvc_ops = {
+	.get_chars = read_console,
+	.put_chars = write_console,
+};
+
+static int __init xen_init(void)
+{
+	struct hvc_struct *hp;
+
+	if (!is_running_on_xen())
+		return 0;
+
+	xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
+	if (xencons_irq < 0)
+		xencons_irq = 0 /* NO_IRQ */;
+	hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
+	if (IS_ERR(hp))
+		return PTR_ERR(hp);
+
+	hvc = hp;
+	return 0;
+}
+
+static void __exit xen_fini(void)
+{
+	if (hvc)
+		hvc_remove(hvc);
+}
+
+static int xen_cons_init(void)
+{
+	if (!is_running_on_xen())
+		return 0;
+
+	hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
+	return 0;
+}
+
+module_init(xen_init);
+module_exit(xen_fini);
+console_initcall(xen_cons_init);
+
+static void xenboot_write_console(struct console *console, const char *string,
+				  unsigned len)
+{
+	unsigned int linelen, off = 0;
+	const char *pos;
+
+	while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
+		linelen = pos-string+off;
+		if (off + linelen > len)
+			break;
+		write_console(0, string+off, linelen);
+		write_console(0, "\r\n", 2);
+		off += linelen + 1;
+	}
+	if (off < len)
+		write_console(0, string+off, len-off);
+}
+
+struct console xenboot_console = {
+	.name		= "xenboot",
+	.write		= xenboot_write_console,
+	.flags		= CON_PRINTBUFFER | CON_BOOT,
+};
diff --git a/include/xen/events.h b/include/xen/events.h
index 7abe4ddfac54..2bde54d29be5 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -13,6 +13,7 @@ enum ipi_vector {
 	XEN_NR_IPIS,
 };
 
+int bind_evtchn_to_irq(unsigned int evtchn);
 int bind_evtchn_to_irqhandler(unsigned int evtchn,
 			      irq_handler_t handler,
 			      unsigned long irqflags, const char *devname,
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
new file mode 100644
index 000000000000..21c0ecfd786d
--- /dev/null
+++ b/include/xen/hvc-console.h
@@ -0,0 +1,6 @@
+#ifndef XEN_HVC_CONSOLE_H
+#define XEN_HVC_CONSOLE_H
+
+extern struct console xenboot_console;
+
+#endif	/* XEN_HVC_CONSOLE_H */
-- 
cgit v1.2.3


From fefa629abebe328cf6d07f99fe5796dbfc3e4981 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: machine operations

Make the appropriate hypercalls to halt and reboot the virtual machine.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/enlighten.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/smp.c       |  4 +---
 2 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 42756771b8eb..142e74891344 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -30,6 +30,7 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
 
@@ -43,6 +44,7 @@
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/reboot.h>
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -900,6 +902,45 @@ static const struct smp_ops xen_smp_ops __initdata = {
 };
 #endif	/* CONFIG_SMP */
 
+static void xen_reboot(int reason)
+{
+#ifdef CONFIG_SMP
+	smp_send_stop();
+#endif
+
+	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+		BUG();
+}
+
+static void xen_restart(char *msg)
+{
+	xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_emergency_restart(void)
+{
+	xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+	xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+	xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops __initdata xen_machine_ops = {
+	.restart = xen_restart,
+	.halt = xen_machine_halt,
+	.power_off = xen_machine_halt,
+	.shutdown = xen_machine_halt,
+	.crash_shutdown = xen_crash_shutdown,
+	.emergency_restart = xen_emergency_restart,
+};
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -912,6 +953,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	paravirt_ops = xen_paravirt_ops;
+	machine_ops = xen_machine_ops;
+
 #ifdef CONFIG_SMP
 	smp_ops = xen_smp_ops;
 #endif
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
index a91587fbf5c2..a620918f87ee 100644
--- a/arch/i386/xen/smp.c
+++ b/arch/i386/xen/smp.c
@@ -311,9 +311,7 @@ static void stop_self(void *v)
 
 void xen_smp_send_stop(void)
 {
-	cpumask_t mask = cpu_online_map;
-	cpu_clear(smp_processor_id(), mask);
-	xen_smp_call_function_mask(mask, stop_self, NULL, 0);
+	smp_call_function(stop_self, NULL, 0, 0);
 }
 
 void xen_smp_send_reschedule(int cpu)
-- 
cgit v1.2.3


From 3e2b8fbeec8f005672f2a2e862fb9c26a0bafedc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: handle external requests for shutdown, reboot and sysrq

The guest domain can be asked to shutdown or reboot itself, or have a
sysrq key injected, via xenbus.  This patch adds a watcher for those
events, and does the appropriate action.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/Makefile |   2 +-
 arch/i386/xen/manage.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 arch/i386/xen/manage.c

(limited to 'arch')

diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index fd05f243a3f8..7bf2ce399a2a 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
-			events.o time.o
+			events.o time.o manage.o
 
 obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
+/*
+ * Handle extern requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+
+#include <xen/xenbus.h>
+
+#define SHUTDOWN_INVALID  -1
+#define SHUTDOWN_POWEROFF  0
+#define SHUTDOWN_SUSPEND   2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT      4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+static void shutdown_handler(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	char *str;
+	struct xenbus_transaction xbt;
+	int err;
+
+	if (shutting_down != SHUTDOWN_INVALID)
+		return;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+
+	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+	/* Ignore read errors and empty reads. */
+	if (XENBUS_IS_ERR_READ(str)) {
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	xenbus_write(xbt, "control", "shutdown", "");
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN) {
+		kfree(str);
+		goto again;
+	}
+
+	if (strcmp(str, "poweroff") == 0 ||
+	    strcmp(str, "halt") == 0)
+		orderly_poweroff(false);
+	else if (strcmp(str, "reboot") == 0)
+		ctrl_alt_del();
+	else {
+		printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+		shutting_down = SHUTDOWN_INVALID;
+	}
+
+	kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+			  unsigned int len)
+{
+	char sysrq_key = '\0';
+	struct xenbus_transaction xbt;
+	int err;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+	if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+		printk(KERN_ERR "Unable to read sysrq code in "
+		       "control/sysrq\n");
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	if (sysrq_key != '\0')
+		xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+
+	if (sysrq_key != '\0')
+		handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+	.node = "control/shutdown",
+	.callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+	.node = "control/sysrq",
+	.callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+	int err;
+
+	err = register_xenbus_watch(&shutdown_watch);
+	if (err) {
+		printk(KERN_ERR "Failed to set shutdown watcher\n");
+		return err;
+	}
+
+	err = register_xenbus_watch(&sysrq_watch);
+	if (err) {
+		printk(KERN_ERR "Failed to set sysrq watcher\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *data)
+{
+	setup_shutdown_watcher();
+	return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = shutdown_event
+	};
+	register_xenstore_notifier(&xenstore_notifier);
+
+	return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
-- 
cgit v1.2.3


From 60223a326fc8fa6e90e2c3fd28ae6de4a311d731 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: Place vcpu_info structure into per-cpu memory

An experimental patch for Xen allows guests to place their vcpu_info
structs anywhere.  We try to use this to place the vcpu_info into the
PDA, which allows direct access.

If this works, then switch to using direct access operations for
irq_enable, disable, save_fl and restore_fl.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Keir Fraser <keir@xensource.com>
---
 arch/i386/xen/enlighten.c    | 152 +++++++++++++++++++++++++++++++++++++++++--
 arch/i386/xen/setup.c        |   8 ---
 arch/i386/xen/smp.c          |   5 +-
 arch/i386/xen/xen-ops.h      |   2 +-
 include/xen/interface/vcpu.h |  13 ++++
 5 files changed, 164 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 142e74891344..e33fa0990eda 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -61,9 +61,63 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
-void xen_vcpu_setup(int cpu)
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs.  We assume it is to start with, and then set it to zero on
+ * the first failure.  This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if the all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
 {
+	struct vcpu_register_vcpu_info info;
+	int err;
+	struct vcpu_info *vcpup;
+
 	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+	if (!have_vcpu_info_placement)
+		return;		/* already tested, not available */
+
+	vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+	info.mfn = virt_to_mfn(vcpup);
+	info.offset = offset_in_page(vcpup);
+
+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+	       cpu, vcpup, info.mfn, info.offset);
+
+	/* Check to see if the hypervisor will put the vcpu_info
+	   structure where we want it, which allows direct access via
+	   a percpu-variable. */
+	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+	if (err) {
+		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+		have_vcpu_info_placement = 0;
+	} else {
+		/* This cpu is using the registered vcpu info, even if
+		   later ones fail to. */
+		per_cpu(xen_vcpu, cpu) = vcpup;
+		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+		       cpu, vcpup);
+	}
 }
 
 static void __init xen_banner(void)
@@ -123,6 +177,20 @@ static unsigned long xen_save_fl(void)
 	return (-flags) & X86_EFLAGS_IF;
 }
 
+static unsigned long xen_save_fl_direct(void)
+{
+	unsigned long flags;
+
+	/* flag has opposite sense of mask */
+	flags = !x86_read_percpu(xen_vcpu_info.evtchn_upcall_mask);
+
+	/* convert to IF type flag
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
 static void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
@@ -149,6 +217,25 @@ static void xen_restore_fl(unsigned long flags)
 	}
 }
 
+static void xen_restore_fl_direct(unsigned long flags)
+{
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+
+	/* This is an atomic update, so no need to worry about
+	   preemption. */
+	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, flags);
+
+	/* If we get preempted here, then any pending event will be
+	   handled anyway. */
+
+	if (flags == 0) {
+		barrier(); /* unmask then check (avoid races) */
+		if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending)))
+			force_evtchn_callback();
+	}
+}
+
 static void xen_irq_disable(void)
 {
 	/* There's a one instruction preempt window here.  We need to
@@ -159,6 +246,12 @@ static void xen_irq_disable(void)
 	preempt_enable_no_resched();
 }
 
+static void xen_irq_disable_direct(void)
+{
+	/* Atomic update, so preemption not a concern. */
+	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 1);
+}
+
 static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
@@ -179,6 +272,19 @@ static void xen_irq_enable(void)
 		force_evtchn_callback();
 }
 
+static void xen_irq_enable_direct(void)
+{
+	/* Atomic update, so preemption not a concern. */
+	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 0);
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	barrier(); /* unmask then check (avoid races) */
+	if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending)))
+		force_evtchn_callback();
+}
+
 static void xen_safe_halt(void)
 {
 	/* Blocking includes an implicit local_irq_enable(). */
@@ -551,11 +657,21 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
+static void xen_write_cr2(unsigned long cr2)
+{
+	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
 static unsigned long xen_read_cr2(void)
 {
 	return x86_read_percpu(xen_vcpu)->arch.cr2;
 }
 
+static unsigned long xen_read_cr2_direct(void)
+{
+	return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
 static void xen_write_cr4(unsigned long cr4)
 {
 	/* never allow TSC to be disabled */
@@ -753,8 +869,27 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
 			BUG();
 	}
+}
 
-	xen_vcpu_setup(smp_processor_id());
+/* This is called once we have the cpu_possible_map */
+void __init xen_setup_vcpu_info_placement(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		xen_vcpu_setup(cpu);
+
+	/* xen_vcpu_setup managed to place the vcpu_info within the
+	   percpu area for all cpus, so make use of it */
+	if (have_vcpu_info_placement) {
+		printk(KERN_INFO "Xen: using vcpu_info placement\n");
+
+		paravirt_ops.save_fl = xen_save_fl_direct;
+		paravirt_ops.restore_fl = xen_restore_fl_direct;
+		paravirt_ops.irq_disable = xen_irq_disable_direct;
+		paravirt_ops.irq_enable = xen_irq_enable_direct;
+		paravirt_ops.read_cr2 = xen_read_cr2_direct;
+	}
 }
 
 static const struct paravirt_ops xen_paravirt_ops __initdata = {
@@ -788,7 +923,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.write_cr0 = native_write_cr0,
 
 	.read_cr2 = xen_read_cr2,
-	.write_cr2 = native_write_cr2,
+	.write_cr2 = xen_write_cr2,
 
 	.read_cr3 = xen_read_cr3,
 	.write_cr3 = xen_write_cr3,
@@ -974,7 +1109,16 @@ asmlinkage void __init xen_start_kernel(void)
 	/* keep using Xen gdt for now; no urgent need to change it */
 
 	x86_write_percpu(xen_cr3, __pa(pgd));
-	xen_vcpu_setup(0);
+
+#ifdef CONFIG_SMP
+	/* Don't do the full vcpu_info placement stuff until we have a
+	   possible map. */
+	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+#else
+	/* May as well do it now, since there's no good time to call
+	   it later on UP. */
+	xen_setup_vcpu_info_placement();
+#endif
 
 	paravirt_ops.kernel_rpl = 1;
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
index 18a994d5a4c5..3f8684eba62b 100644
--- a/arch/i386/xen/setup.c
+++ b/arch/i386/xen/setup.c
@@ -24,14 +24,6 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
-static __initdata struct shared_info init_shared;
-
-/*
- * Point at some empty memory to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-struct shared_info *HYPERVISOR_shared_info = &init_shared;
-
 unsigned long *phys_to_machine_mapping;
 EXPORT_SYMBOL(phys_to_machine_mapping);
 
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
index a620918f87ee..557b8e24706a 100644
--- a/arch/i386/xen/smp.c
+++ b/arch/i386/xen/smp.c
@@ -142,8 +142,6 @@ void __init xen_smp_prepare_boot_cpu(void)
 	BUG_ON(smp_processor_id() != 0);
 	native_smp_prepare_boot_cpu();
 
-	xen_vcpu_setup(0);
-
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
 	make_lowmem_page_readwrite(&per_cpu__gdt_page);
@@ -152,6 +150,8 @@ void __init xen_smp_prepare_boot_cpu(void)
 		cpus_clear(cpu_sibling_map[cpu]);
 		cpus_clear(cpu_core_map[cpu]);
 	}
+
+	xen_setup_vcpu_info_placement();
 }
 
 void __init xen_smp_prepare_cpus(unsigned int max_cpus)
@@ -262,7 +262,6 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
-	xen_vcpu_setup(cpu);
 	irq_ctx_init(cpu);
 	xen_setup_timer(cpu);
 
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 4069be8ba31f..5b56f7fecd19 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -38,7 +38,7 @@ static inline unsigned xen_get_lazy_mode(void)
 
 void __init xen_fill_possible_map(void);
 
-void xen_vcpu_setup(int cpu);
+void __init xen_setup_vcpu_info_placement(void);
 void xen_smp_prepare_boot_cpu(void);
 void xen_smp_prepare_cpus(unsigned int max_cpus);
 int xen_cpu_up(unsigned int cpu);
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index c6218f1ad3ca..ff61ea365997 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -151,4 +151,17 @@ struct vcpu_set_singleshot_timer {
 #define _VCPU_SSHOTTMR_future (0)
 #define VCPU_SSHOTTMR_future  (1U << _VCPU_SSHOTTMR_future)
 
+/*
+ * Register a memory location in the guest address space for the
+ * vcpu_info structure.  This allows the guest to place the vcpu_info
+ * structure in a convenient place, such as in a per-cpu data area.
+ * The pointer need not be page aligned, but the structure must not
+ * cross a page boundary.
+ */
+#define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
+struct vcpu_register_vcpu_info {
+    uint32_t mfn;               /* mfn of page to place vcpu_info */
+    uint32_t offset;            /* offset within page */
+};
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
-- 
cgit v1.2.3


From 6487673b8a858f99a5348e1078b3f5aec700f9e0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: Attempt to patch inline versions of common operations

This patchs adds the mechanism to allow us to patch inline versions of
common operations.

The implementations of the direct-access versions save_fl, restore_fl,
irq_enable and irq_disable are now in assembler, and the same code is
used for both out of line and inline uses.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Keir Fraser <keir@xensource.com>
---
 arch/i386/kernel/asm-offsets.c |   8 +++
 arch/i386/xen/Makefile         |   2 +-
 arch/i386/xen/enlighten.c      | 107 +++++++++++++++++++-------------------
 arch/i386/xen/xen-asm.S        | 114 +++++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/xen-ops.h        |  13 +++++
 5 files changed, 190 insertions(+), 54 deletions(-)
 create mode 100644 arch/i386/xen/xen-asm.S

(limited to 'arch')

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044d..a7c2947b3966 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 
+#include <xen/interface/xen.h>
+
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -115,4 +117,10 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
 }
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
index 7bf2ce399a2a..343df246bd3e 100644
--- a/arch/i386/xen/Makefile
+++ b/arch/i386/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
-			events.o time.o manage.o
+			events.o time.o manage.o xen-asm.o
 
 obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index e33fa0990eda..4fa62a4cb7cc 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -115,6 +115,7 @@ static void __init xen_vcpu_setup(int cpu)
 		/* This cpu is using the registered vcpu info, even if
 		   later ones fail to. */
 		per_cpu(xen_vcpu, cpu) = vcpup;
+
 		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
 		       cpu, vcpup);
 	}
@@ -177,20 +178,6 @@ static unsigned long xen_save_fl(void)
 	return (-flags) & X86_EFLAGS_IF;
 }
 
-static unsigned long xen_save_fl_direct(void)
-{
-	unsigned long flags;
-
-	/* flag has opposite sense of mask */
-	flags = !x86_read_percpu(xen_vcpu_info.evtchn_upcall_mask);
-
-	/* convert to IF type flag
-	   -0 -> 0x00000000
-	   -1 -> 0xffffffff
-	*/
-	return (-flags) & X86_EFLAGS_IF;
-}
-
 static void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
@@ -217,25 +204,6 @@ static void xen_restore_fl(unsigned long flags)
 	}
 }
 
-static void xen_restore_fl_direct(unsigned long flags)
-{
-	/* convert from IF type flag */
-	flags = !(flags & X86_EFLAGS_IF);
-
-	/* This is an atomic update, so no need to worry about
-	   preemption. */
-	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, flags);
-
-	/* If we get preempted here, then any pending event will be
-	   handled anyway. */
-
-	if (flags == 0) {
-		barrier(); /* unmask then check (avoid races) */
-		if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending)))
-			force_evtchn_callback();
-	}
-}
-
 static void xen_irq_disable(void)
 {
 	/* There's a one instruction preempt window here.  We need to
@@ -246,12 +214,6 @@ static void xen_irq_disable(void)
 	preempt_enable_no_resched();
 }
 
-static void xen_irq_disable_direct(void)
-{
-	/* Atomic update, so preemption not a concern. */
-	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 1);
-}
-
 static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
@@ -272,19 +234,6 @@ static void xen_irq_enable(void)
 		force_evtchn_callback();
 }
 
-static void xen_irq_enable_direct(void)
-{
-	/* Atomic update, so preemption not a concern. */
-	x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 0);
-
-	/* Doesn't matter if we get preempted here, because any
-	   pending event will get dealt with anyway. */
-
-	barrier(); /* unmask then check (avoid races) */
-	if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending)))
-		force_evtchn_callback();
-}
-
 static void xen_safe_halt(void)
 {
 	/* Blocking includes an implicit local_irq_enable(). */
@@ -892,6 +841,57 @@ void __init xen_setup_vcpu_info_placement(void)
 	}
 }
 
+static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	char *start, *end, *reloc;
+	unsigned ret;
+
+	start = end = reloc = NULL;
+
+#define SITE(x)								\
+	case PARAVIRT_PATCH(x):						\
+	if (have_vcpu_info_placement) {					\
+		start = (char *)xen_##x##_direct;			\
+		end = xen_##x##_direct_end;				\
+		reloc = xen_##x##_direct_reloc;				\
+	}								\
+	goto patch_site
+
+	switch (type) {
+		SITE(irq_enable);
+		SITE(irq_disable);
+		SITE(save_fl);
+		SITE(restore_fl);
+#undef SITE
+
+	patch_site:
+		if (start == NULL || (end-start) > len)
+			goto default_patch;
+
+		ret = paravirt_patch_insns(insns, len, start, end);
+
+		/* Note: because reloc is assigned from something that
+		   appears to be an array, gcc assumes it's non-null,
+		   but doesn't know its relationship with start and
+		   end. */
+		if (reloc > start && reloc < end) {
+			int reloc_off = reloc - start;
+			long *relocp = (long *)(insns + reloc_off);
+			long delta = start - (char *)insns;
+
+			*relocp += delta;
+		}
+		break;
+
+	default_patch:
+	default:
+		ret = paravirt_patch_default(type, clobbers, insns, len);
+		break;
+	}
+
+	return ret;
+}
+
 static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
@@ -899,7 +899,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.name = "Xen",
 	.banner = xen_banner,
 
-	.patch = paravirt_patch_default,
+	.patch = xen_patch,
 
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
@@ -1076,6 +1076,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
 	.emergency_restart = xen_emergency_restart,
 };
 
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 000000000000..dc4d36d51bc1
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,114 @@
+/*
+	Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+	The inline versions are the same as the direct-use versions, with the
+	pre- and post-amble chopped off.
+
+	This code is encoded for size rather than absolute efficiency,
+	with a view to being able to inline as much as possible.
+
+	We only bother with direct forms (ie, vcpu in pda) of the operations
+	here; the indirect forms are better handled in C, since they're
+	generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+
+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)	.globl x##_end; x##_end=.
+
+/*
+	Enable events.  This clears the event mask and tests the pending
+	event status with one and operation.  If there are pending
+	events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+	/* Clear mask and test pending */
+	andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+	ret
+	ENDPROC(xen_irq_enable_direct)
+	RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+	Disabling events is simply a matter of making the event mask
+	non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+	movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+	ret
+	ENDPROC(xen_irq_disable_direct)
+	RELOC(xen_irq_disable_direct, 0)
+
+/*
+	(xen_)save_fl is used to get the current interrupt enable status.
+	Callers expect the status to be in X86_EFLAGS_IF, and other bits
+	may be set in the return value.  We take advantage of this by
+	making sure that X86_EFLAGS_IF has the right value (and other bits
+	in that byte are 0), but other bits in the return value are
+	undefined.  We need to toggle the state of the bit, because
+	Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+	setz %ah
+	addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+	ret
+	ENDPROC(xen_save_fl_direct)
+	RELOC(xen_save_fl_direct, 0)
+
+
+/*
+	In principle the caller should be passing us a value return
+	from xen_save_fl_direct, but for robustness sake we test only
+	the X86_EFLAGS_IF flag rather than the whole byte. After
+	setting the interrupt mask state, it checks for unmasked
+	pending events and enters the hypervisor to get them delivered
+	if so.
+ */
+ENTRY(xen_restore_fl_direct)
+	testb $X86_EFLAGS_IF>>8, %ah
+	setz %al
+	movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* check for pending but unmasked */
+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+	ret
+	ENDPROC(xen_restore_fl_direct)
+	RELOC(xen_restore_fl_direct, 2b+1)
+
+
+
+/*
+	Force an event check by making a hypercall,
+	but preserve regs before making the call.
+ */
+check_events:
+	push %eax
+	push %ecx
+	push %edx
+	call force_evtchn_callback
+	pop %edx
+	pop %ecx
+	pop %eax
+	ret
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 5b56f7fecd19..33e4c8a16289 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -54,4 +54,17 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 			       void *info, int wait);
 
+
+/* Declare an asm function, along with symbols needed to make it
+   inlineable */
+#define DECL_ASM(ret, name, ...)		\
+	ret name(__VA_ARGS__);			\
+	extern char name##_end[];		\
+	extern char name##_reloc[]		\
+
+DECL_ASM(void, xen_irq_enable_direct, void);
+DECL_ASM(void, xen_irq_disable_direct, void);
+DECL_ASM(unsigned long, xen_save_fl_direct, void);
+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+
 #endif /* XEN_OPS_H */
-- 
cgit v1.2.3


From 600b2fc242992e552e0b4e24c8c1f084b341f39b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: suppress abs symbol warnings for unused reloc pointers

arch/i386/xen/xen-asm.S defines some small pieces of code which are
used to implement a few paravirt_ops.  They're designed so they can be
used either in-place, or be inline patched into their callsites if
there's enough space.

Some of those operations need to make calls out (specifically, if you
re-enable events [interrupts], and there's a pending event at that
time).  These calls need the call instruction to be relocated if the
code is patched inline.  In this case xen_foo_reloc is a
section-relative symbol which points to xen_foo's required relocation.

Other operations have no need of a relocation, and so their
corresponding xen_bar_reloc is absolute 0.  These are the cases which
are triggering the warning.

This patch adds those symbols to the list of safe abs symbols.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Adrian Bunk <bunk@stusta.de>
---
 arch/i386/boot/compressed/relocs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aaf..b0e21c3cee5c 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
 		"__kernel_rt_sigreturn",
 		"__kernel_sigreturn",
 		"SYSENTER_RETURN",
+		"xen_irq_disable_direct_reloc",
+		"xen_save_fl_direct_reloc",
 };
 
 static int is_safe_abs_reloc(const char* sym_name)
-- 
cgit v1.2.3


From 9ec2b804e099e8a326369e6cccab10dee1d172ee Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: use iret directly when possible

Most of the time we can simply use the iret instruction to exit the
kernel, rather than having to use the iret hypercall - the only
exception is if we're returning into vm86 mode, or from delivering an
NMI (which we don't support yet).

When running native, iret has the behaviour of testing for a pending
interrupt atomically with re-enabling interrupts.  Unfortunately
there's no way to do this with Xen, so there's a window in which we
could get a recursive exception after enabling events but before
actually returning to userspace.

This causes a problem: if the nested interrupt causes one of the
task's TIF_WORK_MASK flags to be set, they will not be checked again
before returning to userspace.  This means that pending work may be
left pending indefinitely, until the process enters and leaves the
kernel again.  The net effect is that a pending signal or reschedule
event could be delayed for an unbounded amount of time.

To deal with this, the xen event upcall handler checks to see if the
EIP is within the critical section of the iret code, after events
are (potentially) enabled up to the iret itself.  If its within this
range, it calls the iret critical section fixup, which adjusts the
stack to deal with any unrestored registers, and then shifts the
stack frame up to replace the previous invocation.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
---
 arch/i386/kernel/asm-offsets.c |   1 +
 arch/i386/kernel/entry.S       |  16 +++-
 arch/i386/xen/enlighten.c      |   1 +
 arch/i386/xen/xen-asm.S        | 185 ++++++++++++++++++++++++++++++++++++++++-
 arch/i386/xen/xen-ops.h        |   1 +
 5 files changed, 199 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index a7c2947b3966..25f7eb513928 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -61,6 +61,7 @@ void foo(void)
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	OFFSET(TI_cpu, thread_info, cpu);
 	BLANK();
 
 	OFFSET(GDS_size, Xgt_desc_struct, size);
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index ffb236544270..32980b834935 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1030,7 +1030,21 @@ ENTRY(xen_hypervisor_callback)
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	TRACE_IRQS_OFF
-	mov %esp, %eax
+
+	/* Check to see if we got the event in the critical
+	   region in xen_iret_direct, after we've reenabled
+	   events and checked for pending events.  This simulates
+	   iret instruction's behaviour where it delivers a
+	   pending interrupt when enabling interrupts. */
+	movl PT_EIP(%esp),%eax
+	cmpl $xen_iret_start_crit,%eax
+	jb   1f
+	cmpl $xen_iret_end_crit,%eax
+	jae  1f
+
+	call xen_iret_crit_fixup
+
+1:	mov %esp, %eax
 	call xen_evtchn_do_upcall
 	jmp  ret_from_intr
 	CFI_ENDPROC
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index 4fa62a4cb7cc..9a8c1181c001 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -838,6 +838,7 @@ void __init xen_setup_vcpu_info_placement(void)
 		paravirt_ops.irq_disable = xen_irq_disable_direct;
 		paravirt_ops.irq_enable = xen_irq_enable_direct;
 		paravirt_ops.read_cr2 = xen_read_cr2_direct;
+		paravirt_ops.iret = xen_iret_direct;
 	}
 }
 
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
index dc4d36d51bc1..1a43b60c0c62 100644
--- a/arch/i386/xen/xen-asm.S
+++ b/arch/i386/xen/xen-asm.S
@@ -12,15 +12,21 @@
  */
 
 #include <linux/linkage.h>
+
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/percpu.h>
-#include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
 
 #define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
 #define ENDPATCH(x)	.globl x##_end; x##_end=.
 
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI	0x80000000
+
 /*
 	Enable events.  This clears the event mask and tests the pending
 	event status with one and operation.  If there are pending
@@ -81,13 +87,12 @@ ENDPATCH(xen_save_fl_direct)
  */
 ENTRY(xen_restore_fl_direct)
 	testb $X86_EFLAGS_IF>>8, %ah
-	setz %al
-	movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
 	/* Preempt here doesn't matter because that will deal with
 	   any pending interrupts.  The pending check may end up being
 	   run on the wrong CPU, but that doesn't hurt. */
 
-	/* check for pending but unmasked */
+	/* check for unmasked and pending */
 	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
 	jz 1f
 2:	call check_events
@@ -97,6 +102,178 @@ ENDPATCH(xen_restore_fl_direct)
 	ENDPROC(xen_restore_fl_direct)
 	RELOC(xen_restore_fl_direct, 2b+1)
 
+/*
+	This is run where a normal iret would be run, with the same stack setup:
+	      8: eflags
+	      4: cs
+	esp-> 0: eip
+
+	This attempts to make sure that any pending events are dealt
+	with on return to usermode, but there is a small window in
+	which an event can happen just before entering usermode.  If
+	the nested interrupt ends up setting one of the TIF_WORK_MASK
+	pending work flags, they will not be tested again before
+	returning to usermode. This means that a process can end up
+	with pending work, which will be unprocessed until the process
+	enters and leaves the kernel again, which could be an
+	unbounded amount of time.  This means that a pending signal or
+	reschedule event could be indefinitely delayed.
+
+	The fix is to notice a nested interrupt in the critical
+	window, and if one occurs, then fold the nested interrupt into
+	the current interrupt stack frame, and re-process it
+	iteratively rather than recursively.  This means that it will
+	exit via the normal path, and all pending work will be dealt
+	with appropriately.
+
+	Because the nested interrupt handler needs to deal with the
+	current stack state in whatever form its in, we keep things
+	simple by only using a single register which is pushed/popped
+	on the stack.
+
+	Non-direct iret could be done in the same way, but it would
+	require an annoying amount of code duplication.  We'll assume
+	that direct mode will be the common case once the hypervisor
+	support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+	/* test eflags for special cases */
+	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+	jnz hyper_iret
+
+	push %eax
+	ESP_OFFSET=4	# bytes pushed onto stack
+
+	/* Store vcpu_info pointer for easy access.  Do it this
+	   way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+	GET_THREAD_INFO(%eax)
+	movl TI_cpu(%eax),%eax
+	movl __per_cpu_offset(,%eax,4),%eax
+	lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+	movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+	/* check IF state we're restoring */
+	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+	/* Maybe enable events.  Once this happens we could get a
+	   recursive event, so the critical region starts immediately
+	   afterwards.  However, if that happens we don't end up
+	   resuming the code, so we don't have to be worried about
+	   being preempted to another CPU. */
+	setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+	/* check for unmasked and pending */
+	cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+	/* If there's something pending, mask events again so we
+	   can jump back into xen_hypervisor_callback */
+	sete XEN_vcpu_info_mask(%eax)
+
+	popl %eax
+
+	/* From this point on the registers are restored and the stack
+	   updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+	/* Jump to hypervisor_callback after fixing up the stack.
+	   Events are masked, so jumping out of the critical
+	   region is OK. */
+	je xen_hypervisor_callback
+
+	iret
+xen_iret_end_crit:
+
+hyper_iret:
+	/* put this out of line since its very rarely used */
+	jmp hypercall_page + __HYPERVISOR_iret * 32
+
+	.globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+   This is called by xen_hypervisor_callback in entry.S when it sees
+   that the EIP at the time of interrupt was between xen_iret_start_crit
+   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+   a more refined determination of what to do.
+
+   The stack format at this point is:
+	----------------
+	 ss		: (ss/esp may be present if we came from usermode)
+	 esp		:
+	 eflags		}  outer exception info
+	 cs		}
+	 eip		}
+	---------------- <- edi (copy dest)
+	 eax		:  outer eax if it hasn't been restored
+	----------------
+	 eflags		}  nested exception info
+	 cs		}   (no ss/esp because we're nested
+	 eip		}    from the same ring)
+	 orig_eax	}<- esi (copy src)
+	 - - - - - - - -
+	 fs		}
+	 es		}
+	 ds		}  SAVE_ALL state
+	 eax		}
+	  :		:
+	 ebx		}
+	----------------
+	 return addr	 <- esp
+	----------------
+
+   In order to deliver the nested exception properly, we need to shift
+   everything from the return addr up to the error code so it
+   sits just under the outer exception info.  This means that when we
+   handle the exception, we do it in the context of the outer exception
+   rather than starting a new one.
+
+   The only caveat is that if the outer eax hasn't been
+   restored yet (ie, it's still on stack), we need to insert
+   its value into the SAVE_ALL state before going on, since
+   it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+	/* offsets +4 for return address */
+
+	/*
+	   Paranoia: Make sure we're really coming from userspace.
+	   One could imagine a case where userspace jumps into the
+	   critical range address, but just before the CPU delivers a GP,
+	   it decides to deliver an interrupt instead.  Unlikely?
+	   Definitely.  Easy to avoid?  Yes.  The Intel documents
+	   explicitly say that the reported EIP for a bad jump is the
+	   jump instruction itself, not the destination, but some virtual
+	   environments get this wrong.
+	 */
+	movl PT_CS+4(%esp), %ecx
+	andl $SEGMENT_RPL_MASK, %ecx
+	cmpl $USER_RPL, %ecx
+	je 2f
+
+	lea PT_ORIG_EAX+4(%esp), %esi
+	lea PT_EFLAGS+4(%esp), %edi
+
+	/* If eip is before iret_restore_end then stack
+	   hasn't been restored yet. */
+	cmp $iret_restore_end, %eax
+	jae 1f
+
+	movl 0+4(%edi),%eax		/* copy EAX */
+	movl %eax, PT_EAX+4(%esp)
+
+	lea ESP_OFFSET(%edi),%edi	/* move dest up over saved regs */
+
+	/* set up the copy */
+1:	std
+	mov $(PT_EIP+4) / 4, %ecx	/* copy ret+saved regs up to orig_eax */
+	rep movsl
+	cld
+
+	lea 4(%edi),%esp		/* point esp to new frame */
+2:	ret
 
 
 /*
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
index 33e4c8a16289..b9aaea45f07f 100644
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -67,4 +67,5 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+void xen_iret_direct(void);
 #endif /* XEN_OPS_H */
-- 
cgit v1.2.3


From dfdcdd42fdf63452ddd1bed6f49ae2a35dfb5d6c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:07 -0700
Subject: xen: disable all non-virtual drivers

A domU Xen environment has no non-virtual drivers, so make sure
they're all disabled at once.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/xen/setup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
index 3f8684eba62b..2fe6eac510f0 100644
--- a/arch/i386/xen/setup.c
+++ b/arch/i386/xen/setup.c
@@ -91,4 +91,6 @@ void __init xen_arch_setup(void)
 	/* fill cpus_possible with all available cpus */
 	xen_fill_possible_map();
 #endif
+
+	paravirt_disable_iospace();
 }
-- 
cgit v1.2.3