From 0215b4aa069b989c963d594cf0f1c705e21d8ca5 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 7 Feb 2014 14:50:58 -0500
Subject: powerpc: Fix build failure in sysdev/mpic.c for MPIC_WEIRD=y

Commit 446f6d06fab0b49c61887ecbe8286d6aaa796637 ("powerpc/mpic: Properly
set default triggers") breaks the mpc7447_hpc_defconfig as follows:

  CC      arch/powerpc/sysdev/mpic.o
arch/powerpc/sysdev/mpic.c: In function 'mpic_set_irq_type':
arch/powerpc/sysdev/mpic.c:886:9: error: case label does not reduce to an integer constant
arch/powerpc/sysdev/mpic.c:890:9: error: case label does not reduce to an integer constant
arch/powerpc/sysdev/mpic.c:894:9: error: case label does not reduce to an integer constant
arch/powerpc/sysdev/mpic.c:898:9: error: case label does not reduce to an integer constant

Looking at the cpp output (gcc 4.7.3), I see:

   case mpic->hw_set[MPIC_IDX_VECPRI_SENSE_EDGE] |
        mpic->hw_set[MPIC_IDX_VECPRI_POLARITY_POSITIVE]:

The pointer into an array appears because CONFIG_MPIC_WEIRD=y is set
for this platform, thus enabling the following:

  -------------------
  #ifdef CONFIG_MPIC_WEIRD
  static u32 mpic_infos[][MPIC_IDX_END] = {
        [0] = { /* Original OpenPIC compatible MPIC */

  [...]

  #define MPIC_INFO(name) mpic->hw_set[MPIC_IDX_##name]

  #else /* CONFIG_MPIC_WEIRD */

  #define MPIC_INFO(name) MPIC_##name

  #endif /* CONFIG_MPIC_WEIRD */
  -------------------

Here we convert the case section to if/else if, and also add
the equivalent of a default case to warn about unknown types.
Boot tested on sbc8548, build tested on all defconfigs.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/sysdev/mpic.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 0e166ed4cd16..8209744b2829 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -886,25 +886,25 @@ int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type)
 
 	/* Default: read HW settings */
 	if (flow_type == IRQ_TYPE_DEFAULT) {
-		switch(vold & (MPIC_INFO(VECPRI_POLARITY_MASK) |
-			       MPIC_INFO(VECPRI_SENSE_MASK))) {
-			case MPIC_INFO(VECPRI_SENSE_EDGE) |
-			     MPIC_INFO(VECPRI_POLARITY_POSITIVE):
-				flow_type = IRQ_TYPE_EDGE_RISING;
-				break;
-			case MPIC_INFO(VECPRI_SENSE_EDGE) |
-			     MPIC_INFO(VECPRI_POLARITY_NEGATIVE):
-				flow_type = IRQ_TYPE_EDGE_FALLING;
-				break;
-			case MPIC_INFO(VECPRI_SENSE_LEVEL) |
-			     MPIC_INFO(VECPRI_POLARITY_POSITIVE):
-				flow_type = IRQ_TYPE_LEVEL_HIGH;
-				break;
-			case MPIC_INFO(VECPRI_SENSE_LEVEL) |
-			     MPIC_INFO(VECPRI_POLARITY_NEGATIVE):
-				flow_type = IRQ_TYPE_LEVEL_LOW;
-				break;
-		}
+		int vold_ps;
+
+		vold_ps = vold & (MPIC_INFO(VECPRI_POLARITY_MASK) |
+				  MPIC_INFO(VECPRI_SENSE_MASK));
+
+		if (vold_ps == (MPIC_INFO(VECPRI_SENSE_EDGE) |
+				MPIC_INFO(VECPRI_POLARITY_POSITIVE)))
+			flow_type = IRQ_TYPE_EDGE_RISING;
+		else if	(vold_ps == (MPIC_INFO(VECPRI_SENSE_EDGE) |
+				     MPIC_INFO(VECPRI_POLARITY_NEGATIVE)))
+			flow_type = IRQ_TYPE_EDGE_FALLING;
+		else if (vold_ps == (MPIC_INFO(VECPRI_SENSE_LEVEL) |
+				     MPIC_INFO(VECPRI_POLARITY_POSITIVE)))
+			flow_type = IRQ_TYPE_LEVEL_HIGH;
+		else if (vold_ps == (MPIC_INFO(VECPRI_SENSE_LEVEL) |
+				     MPIC_INFO(VECPRI_POLARITY_NEGATIVE)))
+			flow_type = IRQ_TYPE_LEVEL_LOW;
+		else
+			WARN_ONCE(1, "mpic: unknown IRQ type %d\n", vold);
 	}
 
 	/* Apply to irq desc */
-- 
cgit v1.2.3


From 8cc6b6cd8713457be80202fc4264f05d20bc5e1b Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Date: Wed, 5 Feb 2014 16:20:45 -0200
Subject: powerpc/eeh: Drop taken reference to driver on eeh_rmv_device

Commit f5c57710dd62dd06f176934a8b4b8accbf00f9f8 ("powerpc/eeh: Use
partial hotplug for EEH unaware drivers") introduces eeh_rmv_device,
which may grab a reference to a driver, but not release it.

That prevents a driver from being removed after it has gone through EEH
recovery.

This patch drops the reference if it was taken.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Acked-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/eeh_driver.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7bb30dca4e19..fdc679d309ec 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -362,9 +362,13 @@ static void *eeh_rmv_device(void *data, void *userdata)
 	 */
 	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
 		return NULL;
+
 	driver = eeh_pcid_get(dev);
-	if (driver && driver->err_handler)
-		return NULL;
+	if (driver) {
+		eeh_pcid_put(dev);
+		if (driver->err_handler)
+			return NULL;
+	}
 
 	/* Remove it from PCI subsystem */
 	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
-- 
cgit v1.2.3


From 3ec8b78fcc5aa7745026d8d85a4e9ab52c922765 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 31 Jan 2014 00:30:45 +0530
Subject: powerpc/pseries: Disable relocation on exception while going down
 during crash.

Disable relocation on exception while going down even in kdump case. This
is because we are about clear htab mappings while kexec-ing into kdump
kernel and we may run into issues if we still have AIL ON.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/setup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 8e639d7cbda7..972df0ffd4dc 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -430,8 +430,7 @@ static void pSeries_machine_kexec(struct kimage *image)
 {
 	long rc;
 
-	if (firmware_has_feature(FW_FEATURE_SET_MODE) &&
-	    (image->type != KEXEC_TYPE_CRASH)) {
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
 		rc = pSeries_disable_reloc_on_exc();
 		if (rc != H_SUCCESS)
 			pr_warning("Warning: Failed to disable relocation on "
-- 
cgit v1.2.3


From 429d2e8342954d337abe370d957e78291032d867 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 31 Jan 2014 00:31:04 +0530
Subject: powerpc: Fix kdump hang issue on p8 with relocation on exception
 enabled.

On p8 systems, with relocation on exception feature enabled we are seeing
kdump kernel hang at interrupt vector 0xc*4400. The reason is, with this
feature enabled, exception are raised with MMU (IR=DR=1) ON with the
default offset of 0xc*4000. Since exception is raised in virtual mode it
requires the vector region to be executable without which it fails to
fetch and execute instruction at 0xc*4xxx. For default kernel since kernel
is loaded at real 0, the htab mappings sets the entire kernel text region
executable. But for relocatable kernel (e.g. kdump case) we only copy
interrupt vectors down to real 0 and never marked that region as
executable because in p7 and below we always get exception in real mode.

This patch fixes this issue by marking htab mapping range as executable
that overlaps with the interrupt vector region for relocatable kernel.

Thanks to Ben who helped me to debug this issue and find the root cause.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/sections.h | 12 ++++++++++++
 arch/powerpc/mm/hash_utils_64.c     | 14 ++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 4ee06fe15de4..d0e784e0ff48 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -8,6 +8,7 @@
 
 #ifdef __powerpc64__
 
+extern char __start_interrupts[];
 extern char __end_interrupts[];
 
 extern char __prom_init_toc_start[];
@@ -21,6 +22,17 @@ static inline int in_kernel_text(unsigned long addr)
 	return 0;
 }
 
+static inline int overlaps_interrupt_vector_text(unsigned long start,
+							unsigned long end)
+{
+	unsigned long real_start, real_end;
+	real_start = __start_interrupts - _stext;
+	real_end = __end_interrupts - _stext;
+
+	return start < (unsigned long)__va(real_end) &&
+		(unsigned long)__va(real_start) < end;
+}
+
 static inline int overlaps_kernel_text(unsigned long start, unsigned long end)
 {
 	return start < (unsigned long)__init_end &&
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index de6881259aef..d766d6ee33fe 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -207,6 +207,20 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 		if (overlaps_kernel_text(vaddr, vaddr + step))
 			tprot &= ~HPTE_R_N;
 
+		/*
+		 * If relocatable, check if it overlaps interrupt vectors that
+		 * are copied down to real 0. For relocatable kernel
+		 * (e.g. kdump case) we copy interrupt vectors down to real
+		 * address 0. Mark that region as executable. This is
+		 * because on p8 system with relocation on exception feature
+		 * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+		 * in order to execute the interrupt handlers in virtual
+		 * mode the vector region need to be marked as executable.
+		 */
+		if ((PHYSICAL_START > MEMORY_START) &&
+			overlaps_interrupt_vector_text(vaddr, vaddr + step))
+				tprot &= ~HPTE_R_N;
+
 		hash = hpt_hash(vpn, shift, ssize);
 		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
 
-- 
cgit v1.2.3


From 3b830c824a8b15bf032eafe09c48a09d2441f16d Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Thu, 30 Jan 2014 16:58:42 +0100
Subject: powerpc/relocate fix relocate processing in LE mode

Relocation's code is not working in little endian mode because the r_info
field, which is a 64 bits value, should be read from the right offset.

The current code is optimized to read the r_info field as a 32 bits value
starting at the middle of the double word (offset 12). When running in LE
mode, the read value is not correct since only the MSB is read.

This patch removes this optimization which consist to deal with a 32 bits
value instead of a 64 bits one. This way it works in big and little endian
mode.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/reloc_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S
index b47a0e1ab001..1482327cfeba 100644
--- a/arch/powerpc/kernel/reloc_64.S
+++ b/arch/powerpc/kernel/reloc_64.S
@@ -69,8 +69,8 @@ _GLOBAL(relocate)
 	 * R_PPC64_RELATIVE ones.
 	 */
 	mtctr	r8
-5:	lwz	r0,12(9)	/* ELF64_R_TYPE(reloc->r_info) */
-	cmpwi	r0,R_PPC64_RELATIVE
+5:	ld	r0,8(9)		/* ELF64_R_TYPE(reloc->r_info) */
+	cmpdi	r0,R_PPC64_RELATIVE
 	bne	6f
 	ld	r6,0(r9)	/* reloc->r_offset */
 	ld	r0,16(r9)	/* reloc->r_addend */
-- 
cgit v1.2.3


From 2fdd313f54eca8221125b7898e6a8393604b6fb6 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 24 Jan 2014 15:50:51 +1100
Subject: powerpc/perf: Add Power8 cache & TLB events

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/perf/power8-pmu.c | 144 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index a3f7abd2f13f..96cee20dcd34 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -25,6 +25,37 @@
 #define PM_BRU_FIN			0x10068
 #define PM_BR_MPRED_CMPL		0x400f6
 
+/* All L1 D cache load references counted at finish, gated by reject */
+#define PM_LD_REF_L1			0x100ee
+/* Load Missed L1 */
+#define PM_LD_MISS_L1			0x3e054
+/* Store Missed L1 */
+#define PM_ST_MISS_L1			0x300f0
+/* L1 cache data prefetches */
+#define PM_L1_PREF			0x0d8b8
+/* Instruction fetches from L1 */
+#define PM_INST_FROM_L1			0x04080
+/* Demand iCache Miss */
+#define PM_L1_ICACHE_MISS		0x200fd
+/* Instruction Demand sectors wriittent into IL1 */
+#define PM_L1_DEMAND_WRITE		0x0408c
+/* Instruction prefetch written into IL1 */
+#define PM_IC_PREF_WRITE		0x0408e
+/* The data cache was reloaded from local core's L3 due to a demand load */
+#define PM_DATA_FROM_L3			0x4c042
+/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
+#define PM_DATA_FROM_L3MISS		0x300fe
+/* All successful D-side store dispatches for this thread */
+#define PM_L2_ST			0x17080
+/* All successful D-side store dispatches for this thread that were L2 Miss */
+#define PM_L2_ST_MISS			0x17082
+/* Total HW L3 prefetches(Load+store) */
+#define PM_L3_PREF_ALL			0x4e052
+/* Data PTEG reload */
+#define PM_DTLB_MISS			0x300fc
+/* ITLB Reloaded */
+#define PM_ITLB_MISS			0x400fc
+
 
 /*
  * Raw event encoding for POWER8:
@@ -557,6 +588,8 @@ static int power8_generic_events[] = {
 	[PERF_COUNT_HW_INSTRUCTIONS] =			PM_INST_CMPL,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =		PM_BRU_FIN,
 	[PERF_COUNT_HW_BRANCH_MISSES] =			PM_BR_MPRED_CMPL,
+	[PERF_COUNT_HW_CACHE_REFERENCES] =		PM_LD_REF_L1,
+	[PERF_COUNT_HW_CACHE_MISSES] =			PM_LD_MISS_L1,
 };
 
 static u64 power8_bhrb_filter_map(u64 branch_sample_type)
@@ -596,6 +629,116 @@ static void power8_config_bhrb(u64 pmu_bhrb_filter)
 	mtspr(SPRN_MMCRA, (mfspr(SPRN_MMCRA) | pmu_bhrb_filter));
 }
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power8_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = PM_LD_REF_L1,
+			[ C(RESULT_MISS)   ] = PM_LD_MISS_L1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = PM_ST_MISS_L1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = PM_L1_PREF,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(L1I) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = PM_INST_FROM_L1,
+			[ C(RESULT_MISS)   ] = PM_L1_ICACHE_MISS,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = PM_L1_DEMAND_WRITE,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = PM_IC_PREF_WRITE,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(LL) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = PM_DATA_FROM_L3,
+			[ C(RESULT_MISS)   ] = PM_DATA_FROM_L3MISS,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = PM_L2_ST,
+			[ C(RESULT_MISS)   ] = PM_L2_ST_MISS,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = PM_L3_PREF_ALL,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = PM_DTLB_MISS,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = PM_ITLB_MISS,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(BPU) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = PM_BRU_FIN,
+			[ C(RESULT_MISS)   ] = PM_BR_MPRED_CMPL,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(NODE) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+};
+
+#undef C
+
 static struct power_pmu power8_pmu = {
 	.name			= "POWER8",
 	.n_counter		= 6,
@@ -611,6 +754,7 @@ static struct power_pmu power8_pmu = {
 	.flags			= PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB | PPMU_EBB,
 	.n_generic		= ARRAY_SIZE(power8_generic_events),
 	.generic_events		= power8_generic_events,
+	.cache_events		= &power8_cache_events,
 	.attr_groups		= power8_pmu_attr_groups,
 	.bhrb_nr		= 32,
 };
-- 
cgit v1.2.3


From 8d4887ee30695d6b092e96029ae974505c43bfb6 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 10 Dec 2013 14:02:49 +1100
Subject: powerpc/pseries: Select ARCH_RANDOM on pseries

We have a driver for the ARCH_RANDOM hook in rng.c, so we should select
ARCH_RANDOM on pseries.

Without this the build breaks if you turn ARCH_RANDOM off.

This hasn't broken the build because pseries_defconfig doesn't specify a
value for PPC_POWERNV, which is default y, and selects ARCH_RANDOM.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 37300f6ee244..80b1d57c306a 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -20,6 +20,7 @@ config PPC_PSERIES
 	select PPC_DOORBELL
 	select HAVE_CONTEXT_TRACKING
 	select HOTPLUG_CPU if SMP
+	select ARCH_RANDOM
 	default y
 
 config PPC_SPLPAR
-- 
cgit v1.2.3


From b4d6c06c8de81517320a9964b0c26e096aeadd7b Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Wed, 18 Dec 2013 13:14:53 +1100
Subject: powerpc/perf: Configure BHRB filter before enabling PMU interrupts

Right now the config_bhrb() PMU specific call happens after
write_mmcr0(), which actually enables the PMU for event counting and
interrupts. So there is a small window of time where the PMU and BHRB
runs without the required HW branch filter (if any) enabled in BHRB.

This can cause some of the branch samples to be collected through BHRB
without any filter applied and hence affects the correctness of
the results. This patch moves the BHRB config function call before
enabling interrupts.

Here are some data points captured via trace prints which depicts how we
could get PMU interrupts with BHRB filter NOT enabled with a standard
perf record command line (asking for branch record information as well).

    $ perf record -j any_call ls

Before the patch:-

    ls-1962  [003] d...  2065.299590: .perf_event_interrupt: MMCRA: 40000000000
    ls-1962  [003] d...  2065.299603: .perf_event_interrupt: MMCRA: 40000000000
    ...

    All the PMU interrupts before this point did not have the requested
    HW branch filter enabled in the MMCRA.

    ls-1962  [003] d...  2065.299647: .perf_event_interrupt: MMCRA: 40040000000
    ls-1962  [003] d...  2065.299662: .perf_event_interrupt: MMCRA: 40040000000

After the patch:-

    ls-1850  [008] d...   190.311828: .perf_event_interrupt: MMCRA: 40040000000
    ls-1850  [008] d...   190.311848: .perf_event_interrupt: MMCRA: 40040000000

    All the PMU interrupts have the requested HW BHRB branch filter
    enabled in MMCRA.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[mpe: Fixed up whitespace and cleaned up changelog]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/perf/core-book3s.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 29b89e863d7c..67cf22083f4c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1147,6 +1147,9 @@ static void power_pmu_enable(struct pmu *pmu)
 	mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]);
 
 	mb();
+	if (cpuhw->bhrb_users)
+		ppmu->config_bhrb(cpuhw->bhrb_filter);
+
 	write_mmcr0(cpuhw, mmcr0);
 
 	/*
@@ -1158,8 +1161,6 @@ static void power_pmu_enable(struct pmu *pmu)
 	}
 
  out:
-	if (cpuhw->bhrb_users)
-		ppmu->config_bhrb(cpuhw->bhrb_filter);
 
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From 730efb6193f8568354fd80849612291afa9fa81e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Mon, 23 Dec 2013 23:46:04 +1100
Subject: powerpc/xmon: Don't loop forever in get_output_lock()

If we enter with xmon_speaker != 0 we skip the first cmpxchg(), we also
skip the while loop because xmon_speaker != last_speaker (0) - meaning we
skip the second cmpxchg() also.

Following that code path the compiler sees no memory barriers and so is
within its rights to never reload xmon_speaker. The end result is we loop
forever.

This manifests as all cpus being in xmon ('c' command), but they refuse
to take control when you switch to them ('c x' for cpu # x).

I have seen this deadlock in practice and also checked the generated code to
confirm this is what's happening.

The simplest fix is just to always try the cmpxchg().

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/xmon/xmon.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index a90731b3d44a..598cdc7c7adc 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -309,12 +309,12 @@ static void get_output_lock(void)
 
 	if (xmon_speaker == me)
 		return;
+
 	for (;;) {
-		if (xmon_speaker == 0) {
-			last_speaker = cmpxchg(&xmon_speaker, 0, me);
-			if (last_speaker == 0)
-				return;
-		}
+		last_speaker = cmpxchg(&xmon_speaker, 0, me);
+		if (last_speaker == 0)
+			return;
+
 		timeout = 10000000;
 		while (xmon_speaker == last_speaker) {
 			if (--timeout > 0)
-- 
cgit v1.2.3


From 1507589787529b0d8e2a9e66e0c6f113ecab5181 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 23 Dec 2013 23:46:05 +1100
Subject: powerpc/xmon: Fix timeout loop in get_output_lock()

As far as I can tell, our 70s era timeout loop in get_output_lock() is
generating no code.

This leads to the hostile takeover happening more or less simultaneously
on all cpus. The result is "interesting", some example output that is
more readable than most:

    cpu 0x1: Vector: 100 (Scypsut e0mx bR:e setV)e catto xc0p:u[ c 00
    c0:0  000t0o0V0erc0td:o5 rfc28050000]0c00 0 0  0 6t(pSrycsV1ppuot
    uxe 1m 2 0Rx21e3:0s0ce000c00000t00)00 60602oV2SerucSayt0y 0p 1sxs

Fix it by using udelay() in the timeout loop. The wait time and check
frequency are arbitrary, but seem to work OK. We already rely on
udelay() working so this is not a new dependency.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/xmon/xmon.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 598cdc7c7adc..d712b23974ec 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -315,10 +315,17 @@ static void get_output_lock(void)
 		if (last_speaker == 0)
 			return;
 
-		timeout = 10000000;
+		/*
+		 * Wait a full second for the lock, we might be on a slow
+		 * console, but check every 100us.
+		 */
+		timeout = 10000;
 		while (xmon_speaker == last_speaker) {
-			if (--timeout > 0)
+			if (--timeout > 0) {
+				udelay(100);
 				continue;
+			}
+
 			/* hostile takeover */
 			prev = cmpxchg(&xmon_speaker, last_speaker, me);
 			if (prev == last_speaker)
-- 
cgit v1.2.3


From d2b496e5e1fa7a6796534e435440eb9d3ed184dd Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 23 Dec 2013 23:46:06 +1100
Subject: powerpc/xmon: Don't signal we've entered until we're finished
 printing

Currently we set our cpu's bit in cpus_in_xmon, and then we take the
output lock and print the exception information.

This can race with the master cpu entering the command loop and printing
the backtrace. The result is that the backtrace gets garbled with
another cpu's exception print out.

Fix it by delaying the set of cpus_in_xmon until we are finished
printing.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/xmon/xmon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index d712b23974ec..b07909850f77 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -404,7 +404,6 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 	}
 
 	xmon_fault_jmp[cpu] = recurse_jmp;
-	cpumask_set_cpu(cpu, &cpus_in_xmon);
 
 	bp = NULL;
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT))
@@ -426,6 +425,8 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		release_output_lock();
 	}
 
+	cpumask_set_cpu(cpu, &cpus_in_xmon);
+
  waiting:
 	secondary = 1;
 	while (secondary && !xmon_gate) {
-- 
cgit v1.2.3


From 04a341138da55daf47551308660a5fcc6b5b9077 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Wed, 29 Jan 2014 18:24:54 +0800
Subject: powerpc/ppc32: Fix the bug in the init of non-base exception stack
 for UP

We would allocate one specific exception stack for each kind of
non-base exceptions for every CPU. For ppc32 the CPU hard ID is
used as the subscript to get the specific exception stack for
one CPU. But for an UP kernel, there is only one element in the
each kind of exception stack array. We would get stuck if the
CPU hard ID is not equal to '0'. So in this case we should use the
subscript '0' no matter what the CPU hard ID is.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/irq.c      | 5 +++++
 arch/powerpc/kernel/setup_32.c | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 9729b23bfb0a..1d0848bba049 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -559,8 +559,13 @@ void exc_lvl_ctx_init(void)
 #ifdef CONFIG_PPC64
 		cpu_nr = i;
 #else
+#ifdef CONFIG_SMP
 		cpu_nr = get_hard_smp_processor_id(i);
+#else
+		cpu_nr = 0;
 #endif
+#endif
+
 		memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
 		tp = critirq_ctx[cpu_nr];
 		tp->cpu = cpu_nr;
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 2b0da27eaee4..04cc4fcca78b 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -247,7 +247,12 @@ static void __init exc_lvl_early_init(void)
 	/* interrupt stacks must be in lowmem, we get that for free on ppc32
 	 * as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */
 	for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
 		hw_cpu = get_hard_smp_processor_id(i);
+#else
+		hw_cpu = 0;
+#endif
+
 		critirq_ctx[hw_cpu] = (struct thread_info *)
 			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
 #ifdef CONFIG_BOOKE
-- 
cgit v1.2.3


From ea961a828fe7250e954f086d74d9323c3d44c3e4 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 22 Jan 2014 08:40:28 +1100
Subject: powerpc: Fix endian issues in kexec and crash dump code

We expose a number of OF properties in the kexec and crash dump code
and these need to be big endian.

Cc: stable@vger.kernel.org # v3.13
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/machine_kexec.c    | 14 ++++++++++----
 arch/powerpc/kernel/machine_kexec_64.c |  6 ++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 75d4f7340da8..015ae55c1868 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -196,7 +196,9 @@ int overlaps_crashkernel(unsigned long start, unsigned long size)
 
 /* Values we need to export to the second kernel via the device tree. */
 static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
 static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
 
 static struct property kernel_end_prop = {
 	.name = "linux,kernel-end",
@@ -207,7 +209,7 @@ static struct property kernel_end_prop = {
 static struct property crashk_base_prop = {
 	.name = "linux,crashkernel-base",
 	.length = sizeof(phys_addr_t),
-	.value = &crashk_res.start,
+	.value = &crashk_base
 };
 
 static struct property crashk_size_prop = {
@@ -219,9 +221,11 @@ static struct property crashk_size_prop = {
 static struct property memory_limit_prop = {
 	.name = "linux,memory-limit",
 	.length = sizeof(unsigned long long),
-	.value = &memory_limit,
+	.value = &mem_limit,
 };
 
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+
 static void __init export_crashk_values(struct device_node *node)
 {
 	struct property *prop;
@@ -237,8 +241,9 @@ static void __init export_crashk_values(struct device_node *node)
 		of_remove_property(node, prop);
 
 	if (crashk_res.start != 0) {
+		crashk_base = cpu_to_be_ulong(crashk_res.start),
 		of_add_property(node, &crashk_base_prop);
-		crashk_size = resource_size(&crashk_res);
+		crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
 		of_add_property(node, &crashk_size_prop);
 	}
 
@@ -246,6 +251,7 @@ static void __init export_crashk_values(struct device_node *node)
 	 * memory_limit is required by the kexec-tools to limit the
 	 * crash regions to the actual memory used.
 	 */
+	mem_limit = cpu_to_be_ulong(memory_limit);
 	of_update_property(node, &memory_limit_prop);
 }
 
@@ -264,7 +270,7 @@ static int __init kexec_setup(void)
 		of_remove_property(node, prop);
 
 	/* information needed by userspace when using default_machine_kexec */
-	kernel_end = __pa(_end);
+	kernel_end = cpu_to_be_ulong(__pa(_end));
 	of_add_property(node, &kernel_end_prop);
 
 	export_crashk_values(node);
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index be4e6d648f60..59d229a2a3e0 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -369,6 +369,7 @@ void default_machine_kexec(struct kimage *image)
 
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
+static unsigned long htab_size;
 
 static struct property htab_base_prop = {
 	.name = "linux,htab-base",
@@ -379,7 +380,7 @@ static struct property htab_base_prop = {
 static struct property htab_size_prop = {
 	.name = "linux,htab-size",
 	.length = sizeof(unsigned long),
-	.value = &htab_size_bytes,
+	.value = &htab_size,
 };
 
 static int __init export_htab_values(void)
@@ -403,8 +404,9 @@ static int __init export_htab_values(void)
 	if (prop)
 		of_remove_property(node, prop);
 
-	htab_base = __pa(htab_address);
+	htab_base = cpu_to_be64(__pa(htab_address));
 	of_add_property(node, &htab_base_prop);
+	htab_size = cpu_to_be64(htab_size_bytes);
 	of_add_property(node, &htab_size_prop);
 
 	of_node_put(node);
-- 
cgit v1.2.3


From cd15b048445d0a54f7147c35a86c5a16ef231554 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 11 Feb 2014 11:32:38 +1100
Subject: powerpc/powernv: Add iommu DMA bypass support for IODA2

This patch adds the support for to create a direct iommu "bypass"
window on IODA2 bridges (such as Power8) allowing to bypass iommu
page translation completely for 64-bit DMA capable devices, thus
significantly improving DMA performances.

Additionally, this adds a hook to the struct iommu_table so that
the IOMMU API / VFIO can disable the bypass when external ownership
is requested, since in that case, the device will be used by an
environment such as userspace or a KVM guest which must not be
allowed to bypass translations.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/dma-mapping.h    |  1 +
 arch/powerpc/include/asm/iommu.h          |  1 +
 arch/powerpc/kernel/dma.c                 | 10 ++--
 arch/powerpc/kernel/iommu.c               | 12 +++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 84 +++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c      | 10 ++++
 arch/powerpc/platforms/powernv/pci.h      |  6 ++-
 arch/powerpc/platforms/powernv/powernv.h  |  8 +++
 arch/powerpc/platforms/powernv/setup.c    |  9 ++++
 9 files changed, 137 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index e27e9ad6818e..150866b2a3fe 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -134,6 +134,7 @@ static inline int dma_supported(struct device *dev, u64 mask)
 }
 
 extern int dma_set_mask(struct device *dev, u64 dma_mask);
+extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 
 #define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
 
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f7a8036579b5..42632c7a2a4e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -77,6 +77,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
 	struct iommu_group *it_group;
 #endif
+	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 8032b97ccdcb..ee78f6e49d64 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -191,12 +191,10 @@ EXPORT_SYMBOL(dma_direct_ops);
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 
-int dma_set_mask(struct device *dev, u64 dma_mask)
+int __dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	if (ppc_md.dma_set_mask)
-		return ppc_md.dma_set_mask(dev, dma_mask);
 	if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
 		return dma_ops->set_dma_mask(dev, dma_mask);
 	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
@@ -204,6 +202,12 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 	*dev->dma_mask = dma_mask;
 	return 0;
 }
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (ppc_md.dma_set_mask)
+		return ppc_md.dma_set_mask(dev, dma_mask);
+	return __dma_set_mask(dev, dma_mask);
+}
 EXPORT_SYMBOL(dma_set_mask);
 
 u64 dma_get_required_mask(struct device *dev)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d773dd440a45..88e3ec6e1d96 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1088,6 +1088,14 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	memset(tbl->it_map, 0xff, sz);
 	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
+	/*
+	 * Disable iommu bypass, otherwise the user can DMA to all of
+	 * our physical memory via the bypass window instead of just
+	 * the pages that has been explicitly mapped into the iommu
+	 */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, false);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1102,6 +1110,10 @@ void iommu_release_ownership(struct iommu_table *tbl)
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
+
+	/* The kernel owns the device now, we can restore the iommu bypass */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7d6dcc6d5fa9..3b2b4fb3585b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -21,6 +21,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/memblock.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -460,9 +461,39 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 		return;
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
+	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
 	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
 }
 
+static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
+				     struct pci_dev *pdev, u64 dma_mask)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+	uint64_t top;
+	bool bypass = false;
+
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return -ENODEV;;
+
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	if (pe->tce_bypass_enabled) {
+		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+		bypass = (dma_mask >= top);
+	}
+
+	if (bypass) {
+		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
+		set_dma_ops(&pdev->dev, &dma_direct_ops);
+		set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+	} else {
+		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+		set_dma_ops(&pdev->dev, &dma_iommu_ops);
+		set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+	}
+	return 0;
+}
+
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 {
 	struct pci_dev *dev;
@@ -657,6 +688,56 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+{
+	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
+					      tce32_table);
+	uint16_t window_id = (pe->pe_number << 1 ) + 1;
+	int64_t rc;
+
+	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+	if (enable) {
+		phys_addr_t top = memblock_end_of_DRAM();
+
+		top = roundup_pow_of_two(top);
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     top);
+	} else {
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     0);
+
+		/*
+		 * We might want to reset the DMA ops of all devices on
+		 * this PE. However in theory, that shouldn't be necessary
+		 * as this is used for VFIO/KVM pass-through and the device
+		 * hasn't yet been returned to its kernel driver
+		 */
+	}
+	if (rc)
+		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+	else
+		pe->tce_bypass_enabled = enable;
+}
+
+static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
+					  struct pnv_ioda_pe *pe)
+{
+	/* TVE #1 is selected by PCI address bit 59 */
+	pe->tce_bypass_base = 1ull << 59;
+
+	/* Install set_bypass callback for VFIO */
+	pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
+
+	/* Enable bypass by default */
+	pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
+}
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
@@ -727,6 +808,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	else
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
+	/* Also create a bypass window */
+	pnv_pci_ioda2_setup_bypass_pe(phb, pe);
 	return;
 fail:
 	if (pe->tce32_seg >= 0)
@@ -1286,6 +1369,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
 
 	/* Setup TCEs */
 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
 
 	/* Setup shutdown function for kexec */
 	phb->shutdown = pnv_pci_ioda_shutdown;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index b555ebc57ef5..95633d79ef5d 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -634,6 +634,16 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 		pnv_pci_dma_fallback_setup(hose, pdev);
 }
 
+int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	if (phb && phb->dma_set_mask)
+		return phb->dma_set_mask(phb, pdev, dma_mask);
+	return __dma_set_mask(&pdev->dev, dma_mask);
+}
+
 void pnv_pci_shutdown(void)
 {
 	struct pci_controller *hose;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 13f1942a9a5f..cde169442775 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -54,7 +54,9 @@ struct pnv_ioda_pe {
 	struct iommu_table	tce32_table;
 	phys_addr_t		tce_inval_reg_phys;
 
-	/* XXX TODO: Add support for additional 64-bit iommus */
+	/* 64-bit TCE bypass region */
+	bool			tce_bypass_enabled;
+	uint64_t		tce_bypass_base;
 
 	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
 	 * and -1 if not supported. (It's actually identical to the
@@ -113,6 +115,8 @@ struct pnv_phb {
 			 unsigned int hwirq, unsigned int virq,
 			 unsigned int is_64, struct msi_msg *msg);
 	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+	int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
+			    u64 dma_mask);
 	void (*fixup_phb)(struct pci_controller *hose);
 	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
 	void (*shutdown)(struct pnv_phb *phb);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index de6819be1f95..0051e108ef0f 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -7,12 +7,20 @@ extern void pnv_smp_init(void);
 static inline void pnv_smp_init(void) { }
 #endif
 
+struct pci_dev;
+
 #ifdef CONFIG_PCI
 extern void pnv_pci_init(void);
 extern void pnv_pci_shutdown(void);
+extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
 #else
 static inline void pnv_pci_init(void) { }
 static inline void pnv_pci_shutdown(void) { }
+
+static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+	return -ENODEV;
+}
 #endif
 
 extern void pnv_lpc_init(void);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 21166f65c97c..110f4fbd319f 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -27,6 +27,7 @@
 #include <linux/interrupt.h>
 #include <linux/bug.h>
 #include <linux/cpuidle.h>
+#include <linux/pci.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -141,6 +142,13 @@ static void pnv_progress(char *s, unsigned short hex)
 {
 }
 
+static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (dev_is_pci(dev))
+		return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
+	return __dma_set_mask(dev, dma_mask);
+}
+
 static void pnv_shutdown(void)
 {
 	/* Let the PCI code clear up IODA tables */
@@ -238,6 +246,7 @@ define_machine(powernv) {
 	.machine_shutdown	= pnv_shutdown,
 	.power_save             = powernv_idle,
 	.calibrate_decr		= generic_calibrate_decr,
+	.dma_set_mask		= pnv_dma_set_mask,
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
 #endif
-- 
cgit v1.2.3