From 6f0e8bf16730a36ff6773802d8c8df56d10e60cd Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 27 Oct 2017 19:49:37 +0200
Subject: xen: support 52 bit physical addresses in pv guests

Physical addresses on processors supporting 5 level paging can be up to
52 bits wide. For a Xen pv guest running on such a machine those
physical addresses have to be supported in order to be able to use any
memory on the machine even if the guest itself does not support 5 level
paging.

So when reading/writing a MFN from/to a pte don't use the kernel's
PTE_PFN_MASK but a new XEN_PTE_MFN_MASK allowing full 40 bit wide MFNs.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/include/asm/xen/page.h | 11 ++++++++++-
 arch/x86/xen/mmu_pv.c           |  4 ++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 07b6531813c4..90e91003fd9d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -26,6 +26,15 @@ typedef struct xpaddr {
 	phys_addr_t paddr;
 } xpaddr_t;
 
+#ifdef CONFIG_X86_64
+#define XEN_PHYSICAL_MASK	__sme_clr((1UL << 52) - 1)
+#else
+#define XEN_PHYSICAL_MASK	__PHYSICAL_MASK
+#endif
+
+#define XEN_PTE_MFN_MASK	((pteval_t)(((signed long)PAGE_MASK) & \
+					    XEN_PHYSICAL_MASK))
+
 #define XMADDR(x)	((xmaddr_t) { .maddr = (x) })
 #define XPADDR(x)	((xpaddr_t) { .paddr = (x) })
 
@@ -277,7 +286,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
 
 static inline unsigned long pte_mfn(pte_t pte)
 {
-	return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+	return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
 }
 
 static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 71495f1a86d7..9d9cc3870722 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 static pteval_t pte_mfn_to_pfn(pteval_t val)
 {
 	if (val & _PAGE_PRESENT) {
-		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+		unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
 		unsigned long pfn = mfn_to_pfn(mfn);
 
 		pteval_t flags = val & PTE_FLAGS_MASK;
@@ -1735,7 +1735,7 @@ static unsigned long __init m2p(phys_addr_t maddr)
 {
 	phys_addr_t paddr;
 
-	maddr &= PTE_PFN_MASK;
+	maddr &= XEN_PTE_MFN_MASK;
 	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
 
 	return paddr;
-- 
cgit v1.2.3


From b5494ad83fb52a8e5a7dc1d30cb42cbca5d617f1 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Thu, 2 Nov 2017 18:18:03 -0400
Subject: xen/time: Return -ENODEV from xen_get_wallclock()

For any other error sync_cmos_clock() will reschedule itself
every second or so, for no good reason.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1ecb05db3632..244791ff8201 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -74,7 +74,7 @@ static void xen_get_wallclock(struct timespec *now)
 
 static int xen_set_wallclock(const struct timespec *now)
 {
-	return -1;
+	return -ENODEV;
 }
 
 static int xen_pvclock_gtod_notify(struct notifier_block *nb,
-- 
cgit v1.2.3


From ec4001c3f29ebb3d4147aaec7be9c687ddadb7c8 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Fri, 3 Nov 2017 17:04:11 +0000
Subject: xen: support priv-mapping in an HVM tools domain

If the domain has XENFEAT_auto_translated_physmap then use of the PV-
specific HYPERVISOR_mmu_update hypercall is clearly incorrect.

This patch adds checks in xen_remap_domain_gfn_array() and
xen_unmap_domain_gfn_array() which call through to the approprate
xlate_mmu function if the feature is present. A check is also added
to xen_remap_domain_gfn_range() to fail with -EOPNOTSUPP since this
should not be used in an HVM tools domain.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/mmu.c    | 14 ++++++++++++--
 include/xen/xen-ops.h | 24 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3e15345abfe7..d33e7dbe3129 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
 			       pgprot_t prot, unsigned domid,
 			       struct page **pages)
 {
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return -EOPNOTSUPP;
+
 	return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
@@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
 			       int *err_ptr, pgprot_t prot,
 			       unsigned domid, struct page **pages)
 {
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
+						 prot, domid, pages);
+
 	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
 	 * and the consequences later is quite hard to detect what the actual
 	 * cause of "wrong memory was mapped in".
@@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 /* Returns: 0 success */
 int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
-			       int numpgs, struct page **pages)
+			       int nr, struct page **pages)
 {
-	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return xen_xlate_unmap_gfn_range(vma, nr, pages);
+
+	if (!pages)
 		return 0;
 
 	return -EINVAL;
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 09072271f122..c278b21cad39 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -104,6 +104,8 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
 			       struct page **pages);
 int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages);
+
+#ifdef CONFIG_XEN_AUTO_XLATE
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      unsigned long addr,
 			      xen_pfn_t *gfn, int nr,
@@ -112,6 +114,28 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      struct page **pages);
 int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
 			      int nr, struct page **pages);
+#else
+/*
+ * These two functions are called from arch/x86/xen/mmu.c and so stubs
+ * are needed for a configuration not specifying CONFIG_XEN_AUTO_XLATE.
+ */
+static inline int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
+					    unsigned long addr,
+					    xen_pfn_t *gfn, int nr,
+					    int *err_ptr, pgprot_t prot,
+					    unsigned int domid,
+					    struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+					    int nr, struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr,
 				  unsigned long nr_grant_frames);
 
-- 
cgit v1.2.3


From b988b8ff072ab04abd62d10d3fe44ec544be8a7d Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Nov 2017 10:19:17 +0100
Subject: xen: re-introduce support for grant v2 interface

The grant v2 support was removed from the kernel with
commit 438b33c7145ca8a5131a30c36d8f59bce119a19a ("xen/grant-table:
remove support for V2 tables") as the higher memory footprint of v2
grants resulted in less grants being possible for a kernel compared
to the v1 grant interface.

As machines with more than 16TB of memory are expected to be more
common in the near future support of grant v2 is mandatory in order
to be able to run a Xen pv domain at any memory location.

So re-add grant v2 support basically by reverting above commit.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/arm/xen/grant-table.c |   9 +-
 arch/x86/xen/grant-table.c |  60 ++++++++-
 drivers/xen/grant-table.c  | 312 ++++++++++++++++++++++++++++++++++++++++++++-
 include/xen/grant_table.h  |  30 ++++-
 4 files changed, 398 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/xen/grant-table.c b/arch/arm/xen/grant-table.c
index e43791829ace..91cf08ba1e95 100644
--- a/arch/arm/xen/grant-table.c
+++ b/arch/arm/xen/grant-table.c
@@ -45,7 +45,14 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 	return;
 }
 
-int arch_gnttab_init(unsigned long nr_shared)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	return -ENOSYS;
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
 {
 	return 0;
 }
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 809b6c812654..92ccc718152d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -49,7 +49,7 @@
 static struct gnttab_vm_area {
 	struct vm_struct *area;
 	pte_t **ptes;
-} gnttab_shared_vm_area;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
@@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return 0;
 }
 
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	grant_status_t *shared = *__shared;
+	unsigned long addr;
+	unsigned long i;
+
+	if (shared == NULL)
+		*__shared = shared = gnttab_status_vm_area.area->addr;
+
+	addr = (unsigned long)shared;
+
+	for (i = 0; i < nr_gframes; i++) {
+		set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+			   mfn_pte(frames[i], PAGE_KERNEL));
+		addr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
 void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
+	pte_t **ptes;
 	unsigned long addr;
 	unsigned long i;
 
+	if (shared == gnttab_status_vm_area.area->addr)
+		ptes = gnttab_status_vm_area.ptes;
+	else
+		ptes = gnttab_shared_vm_area.ptes;
+
 	addr = (unsigned long)shared;
 
 	for (i = 0; i < nr_gframes; i++) {
-		set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
-			   __pte(0));
+		set_pte_at(&init_mm, addr, ptes[i], __pte(0));
 		addr += PAGE_SIZE;
 	}
 }
@@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
 	return 0;
 }
 
-int arch_gnttab_init(unsigned long nr_shared)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
 {
+	free_vm_area(area->area);
+	kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+	int ret;
+
 	if (!xen_pv_domain())
 		return 0;
 
-	return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+	ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Always allocate the space for the status frames in case
+	 * we're migrated to a host with V2 support.
+	 */
+	ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	arch_gnttab_vfree(&gnttab_shared_vm_area);
+	return -ENOMEM;
 }
 
 #ifdef CONFIG_XEN_PVH
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 2c6a9114d332..65c4bdb0b463 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -71,6 +71,7 @@ struct grant_frames xen_auto_xlat_grant_frames;
 
 static union {
 	struct grant_entry_v1 *v1;
+	union grant_entry_v2 *v2;
 	void *addr;
 } gnttab_shared;
 
@@ -121,6 +122,29 @@ struct gnttab_ops {
 	 * by bit operations.
 	 */
 	int (*query_foreign_access)(grant_ref_t ref);
+	/*
+	 * Grant a domain to access a range of bytes within the page referred by
+	 * an available grant entry. Ref parameter is reference of a grant entry
+	 * which will be sub-page accessed, domid is id of grantee domain, frame
+	 * is frame address of subpage grant, flags is grant type and flag
+	 * information, page_off is offset of the range of bytes, and length is
+	 * length of bytes to be accessed.
+	 */
+	void (*update_subpage_entry)(grant_ref_t ref, domid_t domid,
+				     unsigned long frame, int flags,
+				     unsigned page_off, unsigned length);
+	/*
+	 * Redirect an available grant entry on domain A to another grant
+	 * reference of domain B, then allow domain C to use grant reference
+	 * of domain B transitively. Ref parameter is an available grant entry
+	 * reference on domain A, domid is id of domain C which accesses grant
+	 * entry transitively, flags is grant type and flag information,
+	 * trans_domid is id of domain B whose grant entry is finally accessed
+	 * transitively, trans_gref is grant entry transitive reference of
+	 * domain B.
+	 */
+	void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags,
+				   domid_t trans_domid, grant_ref_t trans_gref);
 };
 
 struct unmap_refs_callback_data {
@@ -130,6 +154,9 @@ struct unmap_refs_callback_data {
 
 static const struct gnttab_ops *gnttab_interface;
 
+/* This reflects status of grant entries, so act as a global value. */
+static grant_status_t *grstatus;
+
 static int grant_table_version;
 static int grefs_per_grant_frame;
 
@@ -138,6 +165,7 @@ static struct gnttab_free_callback *gnttab_free_callback_list;
 static int gnttab_expand(unsigned int req_entries);
 
 #define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
 
 static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
 {
@@ -210,7 +238,7 @@ static void put_free_entry(grant_ref_t ref)
 }
 
 /*
- * Following applies to gnttab_update_entry_v1.
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
  * Introducing a valid entry into the grant table:
  *  1. Write ent->domid.
  *  2. Write ent->frame:
@@ -229,6 +257,15 @@ static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
 	gnttab_shared.v1[ref].flags = flags;
 }
 
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+				   unsigned long frame, unsigned int flags)
+{
+	gnttab_shared.v2[ref].hdr.domid = domid;
+	gnttab_shared.v2[ref].full_page.frame = frame;
+	wmb();	/* Hypervisor concurrent accesses. */
+	gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags;
+}
+
 /*
  * Public grant-issuing interface functions
  */
@@ -255,11 +292,132 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 }
 EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
 
+static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid,
+					   unsigned long frame, int flags,
+					   unsigned page_off, unsigned length)
+{
+	gnttab_shared.v2[ref].sub_page.frame = frame;
+	gnttab_shared.v2[ref].sub_page.page_off = page_off;
+	gnttab_shared.v2[ref].sub_page.length = length;
+	gnttab_shared.v2[ref].hdr.domid = domid;
+	wmb();
+	gnttab_shared.v2[ref].hdr.flags =
+				GTF_permit_access | GTF_sub_page | flags;
+}
+
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+					    unsigned long frame, int flags,
+					    unsigned page_off,
+					    unsigned length)
+{
+	if (flags & (GTF_accept_transfer | GTF_reading |
+		     GTF_writing | GTF_transitive))
+		return -EPERM;
+
+	if (gnttab_interface->update_subpage_entry == NULL)
+		return -ENOSYS;
+
+	gnttab_interface->update_subpage_entry(ref, domid, frame, flags,
+					       page_off, length);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref);
+
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+					int flags, unsigned page_off,
+					unsigned length)
+{
+	int ref, rc;
+
+	ref = get_free_entries(1);
+	if (unlikely(ref < 0))
+		return -ENOSPC;
+
+	rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags,
+						     page_off, length);
+	if (rc < 0) {
+		put_free_entry(ref);
+		return rc;
+	}
+
+	return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage);
+
+bool gnttab_subpage_grants_available(void)
+{
+	return gnttab_interface->update_subpage_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available);
+
+static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid,
+					 int flags, domid_t trans_domid,
+					 grant_ref_t trans_gref)
+{
+	gnttab_shared.v2[ref].transitive.trans_domid = trans_domid;
+	gnttab_shared.v2[ref].transitive.gref = trans_gref;
+	gnttab_shared.v2[ref].hdr.domid = domid;
+	wmb();
+	gnttab_shared.v2[ref].hdr.flags =
+				GTF_permit_access | GTF_transitive | flags;
+}
+
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+					  int flags, domid_t trans_domid,
+					  grant_ref_t trans_gref)
+{
+	if (flags & (GTF_accept_transfer | GTF_reading |
+		     GTF_writing | GTF_sub_page))
+		return -EPERM;
+
+	if (gnttab_interface->update_trans_entry == NULL)
+		return -ENOSYS;
+
+	gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid,
+					     trans_gref);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref);
+
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+				      domid_t trans_domid,
+				      grant_ref_t trans_gref)
+{
+	int ref, rc;
+
+	ref = get_free_entries(1);
+	if (unlikely(ref < 0))
+		return -ENOSPC;
+
+	rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags,
+						   trans_domid, trans_gref);
+	if (rc < 0) {
+		put_free_entry(ref);
+		return rc;
+	}
+
+	return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans);
+
+bool gnttab_trans_grants_available(void)
+{
+	return gnttab_interface->update_trans_entry != NULL;
+}
+EXPORT_SYMBOL_GPL(gnttab_trans_grants_available);
+
 static int gnttab_query_foreign_access_v1(grant_ref_t ref)
 {
 	return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
 }
 
+static int gnttab_query_foreign_access_v2(grant_ref_t ref)
+{
+	return grstatus[ref] & (GTF_reading|GTF_writing);
+}
+
 int gnttab_query_foreign_access(grant_ref_t ref)
 {
 	return gnttab_interface->query_foreign_access(ref);
@@ -282,6 +440,29 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
 	return 1;
 }
 
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+	gnttab_shared.v2[ref].hdr.flags = 0;
+	mb();	/* Concurrent access by hypervisor. */
+	if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+		return 0;
+	} else {
+		/*
+		 * The read of grstatus needs to have acquire semantics.
+		 *  On x86, reads already have that, and we just need to
+		 * protect against compiler reorderings.
+		 * On other architectures we may need a full barrier.
+		 */
+#ifdef CONFIG_X86
+		barrier();
+#else
+		mb();
+#endif
+	}
+
+	return 1;
+}
+
 static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
 {
 	return gnttab_interface->end_foreign_access_ref(ref, readonly);
@@ -442,6 +623,37 @@ static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
 	return frame;
 }
 
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+	unsigned long frame;
+	u16           flags;
+	u16          *pflags;
+
+	pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+	/*
+	 * If a transfer is not even yet started, try to reclaim the grant
+	 * reference and return failure (== 0).
+	 */
+	while (!((flags = *pflags) & GTF_transfer_committed)) {
+		if (sync_cmpxchg(pflags, flags, 0) == flags)
+			return 0;
+		cpu_relax();
+	}
+
+	/* If a transfer is in progress then wait until it is completed. */
+	while (!(flags & GTF_transfer_completed)) {
+		flags = *pflags;
+		cpu_relax();
+	}
+
+	rmb();  /* Read the frame number /after/ reading completion status. */
+	frame = gnttab_shared.v2[ref].full_page.frame;
+	BUG_ON(frame == 0);
+
+	return frame;
+}
+
 unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
 {
 	return gnttab_interface->end_foreign_transfer_ref(ref);
@@ -938,6 +1150,12 @@ int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item)
 }
 EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync);
 
+static unsigned int nr_status_frames(unsigned int nr_grant_frames)
+{
+	BUG_ON(grefs_per_grant_frame == 0);
+	return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP;
+}
+
 static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes)
 {
 	int rc;
@@ -955,6 +1173,55 @@ static void gnttab_unmap_frames_v1(void)
 	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
 }
 
+static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes)
+{
+	uint64_t *sframes;
+	unsigned int nr_sframes;
+	struct gnttab_get_status_frames getframes;
+	int rc;
+
+	nr_sframes = nr_status_frames(nr_gframes);
+
+	/* No need for kzalloc as it is initialized in following hypercall
+	 * GNTTABOP_get_status_frames.
+	 */
+	sframes = kmalloc_array(nr_sframes, sizeof(uint64_t), GFP_ATOMIC);
+	if (!sframes)
+		return -ENOMEM;
+
+	getframes.dom        = DOMID_SELF;
+	getframes.nr_frames  = nr_sframes;
+	set_xen_guest_handle(getframes.frame_list, sframes);
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+				       &getframes, 1);
+	if (rc == -ENOSYS) {
+		kfree(sframes);
+		return -ENOSYS;
+	}
+
+	BUG_ON(rc || getframes.status);
+
+	rc = arch_gnttab_map_status(sframes, nr_sframes,
+				    nr_status_frames(gnttab_max_grant_frames()),
+				    &grstatus);
+	BUG_ON(rc);
+	kfree(sframes);
+
+	rc = arch_gnttab_map_shared(frames, nr_gframes,
+				    gnttab_max_grant_frames(),
+				    &gnttab_shared.addr);
+	BUG_ON(rc);
+
+	return 0;
+}
+
+static void gnttab_unmap_frames_v2(void)
+{
+	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+	arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
+}
+
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
 	struct gnttab_setup_table setup;
@@ -1022,13 +1289,45 @@ static const struct gnttab_ops gnttab_v1_ops = {
 	.query_foreign_access		= gnttab_query_foreign_access_v1,
 };
 
+static const struct gnttab_ops gnttab_v2_ops = {
+	.map_frames			= gnttab_map_frames_v2,
+	.unmap_frames			= gnttab_unmap_frames_v2,
+	.update_entry			= gnttab_update_entry_v2,
+	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v2,
+	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v2,
+	.query_foreign_access		= gnttab_query_foreign_access_v2,
+	.update_subpage_entry		= gnttab_update_subpage_entry_v2,
+	.update_trans_entry		= gnttab_update_trans_entry_v2,
+};
+
 static void gnttab_request_version(void)
 {
-	/* Only version 1 is used, which will always be available. */
-	grant_table_version = 1;
-	grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1);
-	gnttab_interface = &gnttab_v1_ops;
+	int rc;
+	struct gnttab_set_version gsv;
 
+	gsv.version = 1;
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+	if (rc == 0 && gsv.version == 2) {
+		grant_table_version = 2;
+		grefs_per_grant_frame = XEN_PAGE_SIZE /
+					sizeof(union grant_entry_v2);
+		gnttab_interface = &gnttab_v2_ops;
+	} else if (grant_table_version == 2) {
+		/*
+		 * If we've already used version 2 features,
+		 * but then suddenly discover that they're not
+		 * available (e.g. migrating to an older
+		 * version of Xen), almost unbounded badness
+		 * can happen.
+		 */
+		panic("we need grant tables version 2, but only version 1 is available");
+	} else {
+		grant_table_version = 1;
+		grefs_per_grant_frame = XEN_PAGE_SIZE /
+					sizeof(struct grant_entry_v1);
+		gnttab_interface = &gnttab_v1_ops;
+	}
 	pr_info("Grant tables using version %d layout\n", grant_table_version);
 }
 
@@ -1122,7 +1421,8 @@ int gnttab_init(void)
 		}
 	}
 
-	ret = arch_gnttab_init(max_nr_grant_frames);
+	ret = arch_gnttab_init(max_nr_grant_frames,
+			       nr_status_frames(max_nr_grant_frames));
 	if (ret < 0)
 		goto ini_nomem;
 
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 34b1379f9777..dd6c7a32ee32 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -84,6 +84,24 @@ int gnttab_resume(void);
 
 int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 				int readonly);
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+					int flags, unsigned page_off,
+					unsigned length);
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+				      domid_t trans_domid,
+				      grant_ref_t trans_gref);
+
+/*
+ * Are sub-page grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_subpage_grants_available(void);
+
+/*
+ * Are transitive grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_trans_grants_available(void);
 
 /*
  * End access through the given grant reference, iff the grant entry is no
@@ -130,6 +148,13 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
 
 void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 				     unsigned long frame, int readonly);
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+					    unsigned long frame, int flags,
+					    unsigned page_off,
+					    unsigned length);
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+					  int flags, domid_t trans_domid,
+					  grant_ref_t trans_gref);
 
 /* Give access to the first 4K of the page */
 static inline void gnttab_page_grant_foreign_access_ref_one(
@@ -174,10 +199,13 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
 	unmap->dev_bus_addr = 0;
 }
 
-int arch_gnttab_init(unsigned long nr_shared);
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status);
 int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
 			   void **__shared);
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared);
 void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
 
 struct grant_frames {
-- 
cgit v1.2.3


From 223c8f3349f9ce52847dfa266b32761ac223b9ac Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Nov 2017 10:19:20 +0100
Subject: xen: update arch/x86/include/asm/xen/cpuid.h

Update arch/x86/include/asm/xen/cpuid.h from the Xen tree to get newest
definitions.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/include/asm/xen/cpuid.h | 42 ++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 3bdd10d71223..a9630104f1c4 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -73,22 +73,44 @@
 #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
 
+/*
+ * Leaf 4 (0x40000x03)
+ * Sub-leaf 0: EAX: bit 0: emulated tsc
+ *                  bit 1: host tsc is known to be reliable
+ *                  bit 2: RDTSCP instruction available
+ *             EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate,
+ *                            2=no emulation, 3=no emulation + TSC_AUX support
+ *             ECX: guest tsc frequency in kHz
+ *             EDX: guest tsc incarnation (migration count)
+ * Sub-leaf 1: EAX: tsc offset low part
+ *             EBX: tsc offset high part
+ *             ECX: multiplicator for tsc->ns conversion
+ *             EDX: shift amount for tsc->ns conversion
+ * Sub-leaf 2: EAX: host tsc frequency in kHz
+ */
+
 /*
  * Leaf 5 (0x40000x04)
  * HVM-specific features
- * EAX: Features
- * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: EAX: Features
+ * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
  */
-
-/* Virtualized APIC registers */
-#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0)
-/* Virtualized x2APIC accesses */
-#define XEN_HVM_CPUID_X2APIC_VIRT      (1u << 1)
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT      (1u << 1) /* Virtualized x2APIC accesses */
 /* Memory mapped from other domains has valid IOMMU entries */
 #define XEN_HVM_CPUID_IOMMU_MAPPINGS   (1u << 2)
-/* vcpu id is present in EBX */
-#define XEN_HVM_CPUID_VCPU_ID_PRESENT  (1u << 3)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT  (1u << 3) /* vcpu id is present in EBX */
+
+/*
+ * Leaf 6 (0x40000x05)
+ * PV-specific parameters
+ * Sub-leaf 0: EAX: max available sub-leaf
+ * Sub-leaf 0: EBX: bits 0-7: max machine address width
+ */
+
+/* Max. address width in bits taking memory hotplug into account. */
+#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0)
 
-#define XEN_CPUID_MAX_NUM_LEAVES 4
+#define XEN_CPUID_MAX_NUM_LEAVES 5
 
 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
-- 
cgit v1.2.3


From 9f08890ab906abaf9d4c1bad8111755cbd302260 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 8 Nov 2017 17:19:55 +0000
Subject: x86/pvclock: add setter for pvclock_pvti_cpu0_va

Right now there is only a pvclock_pvti_cpu0_va() which is defined
on kvmclock since:

commit dac16fba6fc5
("x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap")

The only user of this interface so far is kvm. This commit adds a
setter function for the pvti page and moves pvclock_pvti_cpu0_va
to pvclock, which is a more generic place to have it; and would
allow other PV clocksources to use it, such as Xen.

While moving pvclock_pvti_cpu0_va into pvclock, rename also this
function to pvclock_get_pvti_cpu0_va (including its call sites)
to be symmetric with the setter (pvclock_set_pvti_cpu0_va).

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/entry/vdso/vma.c      |  2 +-
 arch/x86/include/asm/pvclock.h | 19 ++++++++++---------
 arch/x86/kernel/kvmclock.c     |  7 +------
 arch/x86/kernel/pvclock.c      | 14 ++++++++++++++
 drivers/ptp/ptp_kvm.c          |  2 +-
 5 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1911310959f8..a77fd3c8d824 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -112,7 +112,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
 	} else if (sym_offset == image->sym_pvclock_page) {
 		struct pvclock_vsyscall_time_info *pvti =
-			pvclock_pvti_cpu0_va();
+			pvclock_get_pvti_cpu0_va();
 		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
 			ret = vm_insert_pfn(
 				vma,
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 448cfe1b48cf..55325f934d71 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -4,15 +4,6 @@
 #include <linux/clocksource.h>
 #include <asm/pvclock-abi.h>
 
-#ifdef CONFIG_KVM_GUEST
-extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
-#else
-static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
-	return NULL;
-}
-#endif
-
 /* some helper functions for xen and kvm pv clock sources */
 u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
 u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -101,4 +92,14 @@ struct pvclock_vsyscall_time_info {
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d88967659098..538738047ff5 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -47,12 +47,6 @@ early_param("no-kvmclock", parse_no_kvmclock);
 static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
-struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
-	return hv_clock;
-}
-EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
-
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -334,6 +328,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
 		return 1;
 	}
 
+	pvclock_set_pvti_cpu0_va(hv_clock);
 	put_cpu();
 
 	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5c3f6d6a5078..761f6af6efa5 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -25,8 +25,10 @@
 
 #include <asm/fixmap.h>
 #include <asm/pvclock.h>
+#include <asm/vgtod.h>
 
 static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
 
 void pvclock_set_flags(u8 flags)
 {
@@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+	WARN_ON(vclock_was_used(VCLOCK_PVCLOCK));
+	pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+	return pvti_cpu0_va;
+}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
index e04d7b2ecb3a..c67dd11e08b1 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/ptp_kvm.c
@@ -182,7 +182,7 @@ static int __init ptp_kvm_init(void)
 		return -ENODEV;
 
 	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	hv_clock = pvclock_pvti_cpu0_va();
+	hv_clock = pvclock_get_pvti_cpu0_va();
 
 	if (!hv_clock)
 		return -ENODEV;
-- 
cgit v1.2.3


From b888808093113ae7d63d213272d01fea4b8329ed Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 8 Nov 2017 17:19:56 +0000
Subject: x86/xen/time: set pvclock flags on xen_time_init()

Specifically check for PVCLOCK_TSC_STABLE_BIT and if this bit is set,
then set it too on pvclock flags. This allows Xen clocksource to use it
and thus speeding up xen_clocksource_read() callers (i.e. sched_clock())

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/time.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 244791ff8201..c96e61fd70e7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -372,6 +372,7 @@ static const struct pv_time_ops xen_time_ops __initconst = {
 
 static void __init xen_time_init(void)
 {
+	struct pvclock_vcpu_time_info *pvti;
 	int cpu = smp_processor_id();
 	struct timespec tp;
 
@@ -395,6 +396,14 @@ static void __init xen_time_init(void)
 
 	setup_force_cpu_cap(X86_FEATURE_TSC);
 
+	/*
+	 * We check ahead on the primary time info if this
+	 * bit is supported hence speeding up Xen clocksource.
+	 */
+	pvti = &__this_cpu_read(xen_vcpu)->time;
+	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT)
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
 	xen_setup_cpu_clockevents();
-- 
cgit v1.2.3


From 2229f70b5bbb025e1394b61007938a68060afbfb Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 8 Nov 2017 17:19:57 +0000
Subject: x86/xen/time: setup vcpu 0 time info page

In order to support pvclock vdso on xen we need to setup the time
info page for vcpu 0 and register the page with Xen using the
VCPUOP_register_vcpu_time_memory_area hypercall. This hypercall
will also forcefully update the pvti which will set some of the
necessary flags for vdso. Afterwards we check if it supports the
PVCLOCK_TSC_STABLE_BIT flag which is mandatory for having
vdso/vsyscall support. And if so, it will set the cpu 0 pvti that
will be later on used when mapping the vdso image.

The xen headers are also updated to include the new hypercall for
registering the secondary vcpu_time_info struct.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/suspend.c       |  4 ++
 arch/x86/xen/time.c          | 90 +++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/xen/xen-ops.h       |  2 +
 include/xen/interface/vcpu.h | 42 +++++++++++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index d6b1680693a9..800ed36ecfba 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -16,6 +16,8 @@
 
 void xen_arch_pre_suspend(void)
 {
+	xen_save_time_memory_area();
+
 	if (xen_pv_domain())
 		xen_pv_pre_suspend();
 }
@@ -26,6 +28,8 @@ void xen_arch_post_suspend(int cancelled)
 		xen_pv_post_suspend(cancelled);
 	else
 		xen_hvm_post_suspend(cancelled);
+
+	xen_restore_time_memory_area();
 }
 
 static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c96e61fd70e7..c2041043c606 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -370,6 +370,92 @@ static const struct pv_time_ops xen_time_ops __initconst = {
 	.steal_clock = xen_steal_clock,
 };
 
+static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+
+void xen_save_time_memory_area(void)
+{
+	struct vcpu_register_time_memory_area t;
+	int ret;
+
+	if (!xen_clock)
+		return;
+
+	t.addr.v = NULL;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+	if (ret != 0)
+		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
+			  ret);
+	else
+		clear_page(xen_clock);
+}
+
+void xen_restore_time_memory_area(void)
+{
+	struct vcpu_register_time_memory_area t;
+	int ret;
+
+	if (!xen_clock)
+		return;
+
+	t.addr.v = &xen_clock->pvti;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+
+	/*
+	 * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
+	 * secondary time info with Xen or if we migrated to a host without the
+	 * necessary flags. On both of these cases what happens is either
+	 * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
+	 * bit set. Userspace checks the latter and if 0, it discards the data
+	 * in pvti and fallbacks to a system call for a reliable timestamp.
+	 */
+	if (ret != 0)
+		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
+			  ret);
+}
+
+static void xen_setup_vsyscall_time_info(void)
+{
+	struct vcpu_register_time_memory_area t;
+	struct pvclock_vsyscall_time_info *ti;
+	int ret;
+
+	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
+	if (!ti)
+		return;
+
+	t.addr.v = &ti->pvti;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+	if (ret) {
+		pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+		free_page((unsigned long)ti);
+		return;
+	}
+
+	/*
+	 * If primary time info had this bit set, secondary should too since
+	 * it's the same data on both just different memory regions. But we
+	 * still check it in case hypervisor is buggy.
+	 */
+	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
+		t.addr.v = NULL;
+		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
+					 0, &t);
+		if (!ret)
+			free_page((unsigned long)ti);
+
+		pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n");
+		return;
+	}
+
+	xen_clock = ti;
+	pvclock_set_pvti_cpu0_va(xen_clock);
+
+	xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
+}
+
 static void __init xen_time_init(void)
 {
 	struct pvclock_vcpu_time_info *pvti;
@@ -401,8 +487,10 @@ static void __init xen_time_init(void)
 	 * bit is supported hence speeding up Xen clocksource.
 	 */
 	pvti = &__this_cpu_read(xen_vcpu)->time;
-	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT)
+	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+		xen_setup_vsyscall_time_info();
+	}
 
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c8a6d224f7ed..f96dbedb33d4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -69,6 +69,8 @@ void xen_setup_runstate_info(int cpu);
 void xen_teardown_timer(int cpu);
 u64 xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
+void xen_save_time_memory_area(void);
+void xen_restore_time_memory_area(void);
 void __init xen_init_time_ops(void);
 void __init xen_hvm_init_time_ops(void);
 
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index 98188c87f5c1..504c71601511 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -178,4 +178,46 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
 
 /* Send an NMI to the specified VCPU. @extra_arg == NULL. */
 #define VCPUOP_send_nmi             11
+
+/*
+ * Get the physical ID information for a pinned vcpu's underlying physical
+ * processor.  The physical ID informmation is architecture-specific.
+ * On x86: id[31:0]=apic_id, id[63:32]=acpi_id.
+ * This command returns -EINVAL if it is not a valid operation for this VCPU.
+ */
+#define VCPUOP_get_physid           12 /* arg == vcpu_get_physid_t */
+struct vcpu_get_physid {
+	uint64_t phys_id;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_get_physid);
+#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid))
+#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32))
+
+/*
+ * Register a memory location to get a secondary copy of the vcpu time
+ * parameters.  The master copy still exists as part of the vcpu shared
+ * memory area, and this secondary copy is updated whenever the master copy
+ * is updated (and using the same versioning scheme for synchronisation).
+ *
+ * The intent is that this copy may be mapped (RO) into userspace so
+ * that usermode can compute system time using the time info and the
+ * tsc.  Usermode will see an array of vcpu_time_info structures, one
+ * for each vcpu, and choose the right one by an existing mechanism
+ * which allows it to get the current vcpu number (such as via a
+ * segment limit).  It can then apply the normal algorithm to compute
+ * system time from the tsc.
+ *
+ * @extra_arg == pointer to vcpu_register_time_info_memory_area structure.
+ */
+#define VCPUOP_register_vcpu_time_memory_area   13
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_time_info);
+struct vcpu_register_time_memory_area {
+	union {
+		GUEST_HANDLE(vcpu_time_info) h;
+		struct pvclock_vcpu_time_info *v;
+		uint64_t p;
+	} addr;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_time_memory_area);
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
-- 
cgit v1.2.3