60 files changed, 2363 insertions, 595 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 445cb6e39d5b..fcc9a89a4695 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC_A2)		+= cpu_setup_a2.o
@@ -47,7 +48,6 @@ obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)	+= idle_power7.o
 obj-$(CONFIG_PPC_OF)		+= of_platform.o prom_parse.o
-obj-$(CONFIG_PPC_CLOCK)		+= clock.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 2ea5cc033ec8..b5aacf72ae6f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -203,6 +203,15 @@ int main(void)
 	DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack));
 	DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack));
 	DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack));
+	DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr));
+
+	DEFINE(TCD_ESEL_NEXT,
+		offsetof(struct tlb_core_data, esel_next));
+	DEFINE(TCD_ESEL_MAX,
+		offsetof(struct tlb_core_data, esel_max));
+	DEFINE(TCD_ESEL_FIRST,
+		offsetof(struct tlb_core_data, esel_first));
+	DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock));
 #endif /* CONFIG_PPC_BOOK3E */
 
 #ifdef CONFIG_PPC_STD_MMU_64
@@ -232,6 +241,10 @@ int main(void)
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp));
+	DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce));
+#endif
 	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
 	DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
 	DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime));
@@ -425,18 +438,14 @@ int main(void)
 	DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
-	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
-	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr));
 #ifdef CONFIG_ALTIVEC
-	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
-	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
-#endif
-#ifdef CONFIG_VSX
-	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr));
 #endif
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -484,16 +493,24 @@ int main(void)
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
+	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
+	DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
 	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
 	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
 	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr));
 	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
 	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
+	DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx));
+	DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr));
+	DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx));
+	DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
@@ -502,8 +519,10 @@ int main(void)
 	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc));
 	DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar));
 	DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar));
+	DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
@@ -511,20 +530,47 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
-	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
+	DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+	DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
+	DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
+	DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
+	DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr));
+	DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr));
+	DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr));
+	DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
+	DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
+	DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
 	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
+	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
 	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
 	DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
 	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
+	DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
 	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
 	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
+	DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
+	DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
+	DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm));
+	DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr));
+	DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
+	DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
+	DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
+	DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
+	DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
+	DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
+	DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm));
+	DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm));
+	DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm));
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -576,6 +622,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
 	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
 	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+	HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
 	HSTATE_FIELD(HSTATE_NAPPING, napping);
@@ -588,6 +635,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 654932727873..2912b8787aa4 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -12,7 +12,6 @@
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
@@ -794,6 +793,9 @@ static void remove_cache_dir(struct cache_dir *cache_dir)
 {
 	remove_index_dirs(cache_dir);
 
+	/* Remove cache dir from sysfs */
+	kobject_del(cache_dir->kobj);
+
 	kobject_put(cache_dir->kobj);
 
 	kfree(cache_dir);
diff --git a/arch/powerpc/kernel/clock.c b/arch/powerpc/kernel/clock.c
deleted file mode 100644
index a764b47791e8..000000000000
--- a/arch/powerpc/kernel/clock.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Dummy clk implementations for powerpc.
- * These need to be overridden in platform code.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <asm/clk_interface.h>
-
-struct clk_interface clk_functions;
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-	if (clk_functions.clk_get)
-		return clk_functions.clk_get(dev, id);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-	if (clk_functions.clk_put)
-		clk_functions.clk_put(clk);
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
-	if (clk_functions.clk_enable)
-		return clk_functions.clk_enable(clk);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
-	if (clk_functions.clk_disable)
-		clk_functions.clk_disable(clk);
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-	if (clk_functions.clk_get_rate)
-		return clk_functions.clk_get_rate(clk);
-	return 0;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-long clk_round_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_round_rate)
-		return clk_functions.clk_round_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_round_rate);
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_set_rate)
-		return clk_functions.clk_set_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-struct clk *clk_get_parent(struct clk *clk)
-{
-	if (clk_functions.clk_get_parent)
-		return clk_functions.clk_get_parent(clk);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get_parent);
-
-int clk_set_parent(struct clk *clk, struct clk *parent)
-{
-	if (clk_functions.clk_set_parent)
-		return clk_functions.clk_set_parent(clk, parent);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_parent);
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index bfb18c7290b7..cc2d8962e090 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -53,11 +53,57 @@ _GLOBAL(__e500_dcache_setup)
 	isync
 	blr
 
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for PW20_WAIT_IDLE_BIT.
+ */
+#define PW20_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_pw20_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Set PW20_WAIT bit, enable pw20 state*/
+	ori	r3, r3, PWRMGTCR0_PW20_WAIT
+	li	r11, PW20_WAIT_IDLE_BIT
+
+	/* Set Automatic PW20 Core Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for AV_WAIT_IDLE_BIT.
+ */
+#define AV_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_altivec_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Enable Altivec Idle */
+	oris	r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h
+	li	r11, AV_WAIT_IDLE_BIT
+
+	/* Set Automatic AltiVec Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
 _GLOBAL(__setup_cpu_e6500)
 	mflr	r6
 #ifdef CONFIG_PPC64
 	bl	.setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	.setup_lrat_ivor
+1:
 #endif
+	bl	setup_pw20_idle
+	bl	setup_altivec_idle
 	bl	__setup_cpu_e5500
 	mtlr	r6
 	blr
@@ -119,6 +165,14 @@ _GLOBAL(__setup_cpu_e5500)
 _GLOBAL(__restore_cpu_e6500)
 	mflr	r5
 	bl	.setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	.setup_lrat_ivor
+1:
+	bl	.setup_pw20_idle
+	bl	.setup_altivec_idle
 	bl	__restore_cpu_e5500
 	mtlr	r5
 	blr
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 18b5b9cf8e37..37d1bb002aa9 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,7 +29,7 @@ _GLOBAL(__setup_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_tlb_power7
 	mtlr	r11
 	blr
 
@@ -42,7 +42,7 @@ _GLOBAL(__restore_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_tlb_power7
 	mtlr	r11
 	blr
 
@@ -59,7 +59,7 @@ _GLOBAL(__setup_cpu_power8)
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
 	bl	__init_HFSCR
-	bl	__init_TLB
+	bl	__init_tlb_power8
 	bl	__init_PMU_HV
 	mtlr	r11
 	blr
@@ -78,7 +78,7 @@ _GLOBAL(__restore_cpu_power8)
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
 	bl	__init_HFSCR
-	bl	__init_TLB
+	bl	__init_tlb_power8
 	bl	__init_PMU_HV
 	mtlr	r11
 	blr
@@ -134,15 +134,31 @@ __init_HFSCR:
 	mtspr	SPRN_HFSCR,r3
 	blr
 
-__init_TLB:
-	/*
-	 * Clear the TLB using the "IS 3" form of tlbiel instruction
-	 * (invalidate by congruence class). P7 has 128 CCs, P8 has 512
-	 * so we just always do 512
-	 */
+/*
+ * Clear the TLB using the specified IS form of tlbiel instruction
+ * (invalidate by congruence class). P7 has 128 CCs., P8 has 512.
+ *
+ * r3 = IS field
+ */
+__init_tlb_power7:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power7)
+	li	r6,128
+	mtctr	r6
+	mr	r7,r3		/* IS field */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:	blr
+
+__init_tlb_power8:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power8)
 	li	r6,512
 	mtctr	r6
-	li	r7,0xc00	/* IS field = 0b11 */
+	mr	r7,r3		/* IS field */
 	ptesync
 2:	tlbiel	r7
 	addi	r7,r7,0x1000
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 597d954e5860..6c8dd5da4de5 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -71,6 +71,10 @@ extern void __restore_cpu_power7(void);
 extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power8(void);
 extern void __restore_cpu_a2(void);
+extern void __flush_tlb_power7(unsigned long inval_selector);
+extern void __flush_tlb_power8(unsigned long inval_selector);
+extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
@@ -440,6 +444,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
 	{	/* 2.07-compliant processor, i.e. Power8 "architected" mode */
@@ -456,6 +462,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Power7 */
@@ -474,6 +482,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
 	{	/* Power7+ */
@@ -492,6 +502,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7+",
 	},
 	{	/* Power8E */
@@ -510,6 +522,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_INVALID,
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Power8 */
@@ -528,6 +542,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_INVALID,
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Cell Broadband Engine */
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index fdcd8f551aff..18d7c80ddeb9 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -17,7 +17,6 @@
 #include <linux/export.h>
 #include <linux/crash_dump.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/types.h>
 
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 779a78c26435..11c1d069d920 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -124,15 +124,15 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
 {
 	unsigned long addr;
-	const u32 *basep, *sizep;
+	const __be32 *basep, *sizep;
 	unsigned int rtas_start = 0, rtas_end = 0;
 
 	basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
 	sizep = of_get_property(rtas.dev, "rtas-size", NULL);
 
 	if (basep && sizep) {
-		rtas_start = *basep;
-		rtas_end = *basep + *sizep;
+		rtas_start = be32_to_cpup(basep);
+		rtas_end = rtas_start + be32_to_cpup(sizep);
 	}
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index e4897523de41..54d0116256f7 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -83,10 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask)
 		return 0;
 	}
 
-	if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT)) {
+	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
-				mask, tbl->it_offset << IOMMU_PAGE_SHIFT);
+				mask, tbl->it_offset << tbl->it_page_shift);
 		return 0;
 	} else
 		return 1;
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 8032b97ccdcb..ee78f6e49d64 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -191,12 +191,10 @@ EXPORT_SYMBOL(dma_direct_ops);
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 
-int dma_set_mask(struct device *dev, u64 dma_mask)
+int __dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	struct dma_map_ops *dma_ops = get_dma_ops(dev);
 
-	if (ppc_md.dma_set_mask)
-		return ppc_md.dma_set_mask(dev, dma_mask);
 	if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
 		return dma_ops->set_dma_mask(dev, dma_mask);
 	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
@@ -204,6 +202,12 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 	*dev->dma_mask = dma_mask;
 	return 0;
 }
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (ppc_md.dma_set_mask)
+		return ppc_md.dma_set_mask(dev, dma_mask);
+	return __dma_set_mask(dev, dma_mask);
+}
 EXPORT_SYMBOL(dma_set_mask);
 
 u64 dma_get_required_mask(struct device *dev)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 4bd687d5e7aa..148db72a8c43 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -84,7 +84,7 @@
 #define EEH_MAX_FAILS	2100000
 
 /* Time to wait for a PCI slot to report status, in milliseconds */
-#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
+#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000)
 
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
@@ -921,6 +921,13 @@ void eeh_add_device_late(struct pci_dev *dev)
 		eeh_sysfs_remove_device(edev->pdev);
 		edev->mode &= ~EEH_DEV_SYSFS;
 
+		/*
+		 * We definitely should have the PCI device removed
+		 * though it wasn't correctly. So we needn't call
+		 * into error handler afterwards.
+		 */
+		edev->mode |= EEH_DEV_NO_HANDLER;
+
 		edev->pdev = NULL;
 		dev->dev.archdata.edev = NULL;
 	}
@@ -1023,6 +1030,14 @@ void eeh_remove_device(struct pci_dev *dev)
 	else
 		edev->mode |= EEH_DEV_DISCONNECTED;
 
+	/*
+	 * We're removing from the PCI subsystem, that means
+	 * the PCI device driver can't support EEH or not
+	 * well. So we rely on hotplug completely to do recovery
+	 * for the specific PCI device.
+	 */
+	edev->mode |= EEH_DEV_NO_HANDLER;
+
 	eeh_addr_cache_rmv_dev(dev);
 	eeh_sysfs_remove_device(dev);
 	edev->mode &= ~EEH_DEV_SYSFS;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 36bed5a12750..fdc679d309ec 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -217,7 +217,8 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
 	if (!driver) return NULL;
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->mmio_enabled) {
+	    !driver->err_handler->mmio_enabled ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -258,7 +259,8 @@ static void *eeh_report_reset(void *data, void *userdata)
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->slot_reset) {
+	    !driver->err_handler->slot_reset ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -297,7 +299,9 @@ static void *eeh_report_resume(void *data, void *userdata)
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->resume) {
+	    !driver->err_handler->resume ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+		edev->mode &= ~EEH_DEV_NO_HANDLER;
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -358,9 +362,13 @@ static void *eeh_rmv_device(void *data, void *userdata)
 	 */
 	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
 		return NULL;
+
 	driver = eeh_pcid_get(dev);
-	if (driver && driver->err_handler)
-		return NULL;
+	if (driver) {
+		eeh_pcid_put(dev);
+		if (driver->err_handler)
+			return NULL;
+	}
 
 	/* Remove it from PCI subsystem */
 	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
@@ -369,7 +377,9 @@ static void *eeh_rmv_device(void *data, void *userdata)
 	edev->mode |= EEH_DEV_DISCONNECTED;
 	(*removed)++;
 
+	pci_lock_rescan_remove();
 	pci_stop_and_remove_bus_device(dev);
+	pci_unlock_rescan_remove();
 
 	return NULL;
 }
@@ -416,10 +426,13 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 	 * into pcibios_add_pci_devices().
 	 */
 	eeh_pe_state_mark(pe, EEH_PE_KEEP);
-	if (bus)
+	if (bus) {
+		pci_lock_rescan_remove();
 		pcibios_remove_pci_devices(bus);
-	else if (frozen_bus)
+		pci_unlock_rescan_remove();
+	} else if (frozen_bus) {
 		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
+	}
 
 	/* Reset the pci controller. (Asserts RST#; resets config space).
 	 * Reconfigure bridges and devices. Don't try to bring the system
@@ -429,6 +442,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 	if (rc)
 		return rc;
 
+	pci_lock_rescan_remove();
+
 	/* Restore PE */
 	eeh_ops->configure_bridge(pe);
 	eeh_pe_restore_bars(pe);
@@ -462,13 +477,14 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 	pe->tstamp = tstamp;
 	pe->freeze_count = cnt;
 
+	pci_unlock_rescan_remove();
 	return 0;
 }
 
 /* The longest amount of time to wait for a pci device
  * to come back on line, in seconds.
  */
-#define MAX_WAIT_FOR_RECOVERY 150
+#define MAX_WAIT_FOR_RECOVERY 300
 
 static void eeh_handle_normal_event(struct eeh_pe *pe)
 {
@@ -618,92 +634,103 @@ perm_error:
 	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
 
 	/* Shut down the device drivers for good. */
-	if (frozen_bus)
+	if (frozen_bus) {
+		pci_lock_rescan_remove();
 		pcibios_remove_pci_devices(frozen_bus);
+		pci_unlock_rescan_remove();
+	}
 }
 
 static void eeh_handle_special_event(void)
 {
 	struct eeh_pe *pe, *phb_pe;
 	struct pci_bus *bus;
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	unsigned long flags;
-	int rc = 0;
+	int rc;
 
-	/*
-	 * The return value from next_error() has been classified as follows.
-	 * It might be good to enumerate them. However, next_error() is only
-	 * supported by PowerNV platform for now. So it would be fine to use
-	 * integer directly:
-	 *
-	 * 4 - Dead IOC           3 - Dead PHB
-	 * 2 - Fenced PHB         1 - Frozen PE
-	 * 0 - No error found
-	 *
-	 */
-	rc = eeh_ops->next_error(&pe);
-	if (rc <= 0)
-		return;
 
-	switch (rc) {
-	case 4:
-		/* Mark all PHBs in dead state */
-		eeh_serialize_lock(&flags);
-		list_for_each_entry_safe(hose, tmp,
-				&hose_list, list_node) {
-			phb_pe = eeh_phb_pe_get(hose);
-			if (!phb_pe) continue;
-
-			eeh_pe_state_mark(phb_pe,
-				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+	do {
+		rc = eeh_ops->next_error(&pe);
+
+		switch (rc) {
+		case EEH_NEXT_ERR_DEAD_IOC:
+			/* Mark all PHBs in dead state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events */
+			eeh_remove_event(NULL);
+
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe) continue;
+
+				eeh_pe_state_mark(phb_pe,
+					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+			}
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_FROZEN_PE:
+		case EEH_NEXT_ERR_FENCED_PHB:
+		case EEH_NEXT_ERR_DEAD_PHB:
+			/* Mark the PE in fenced state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events of the PHB */
+			eeh_remove_event(pe);
+
+			if (rc == EEH_NEXT_ERR_DEAD_PHB)
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+			else
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_NONE:
+			return;
+		default:
+			pr_warn("%s: Invalid value %d from next_error()\n",
+				__func__, rc);
+			return;
 		}
-		eeh_serialize_unlock(flags);
-
-		/* Purge all events */
-		eeh_remove_event(NULL);
-		break;
-	case 3:
-	case 2:
-	case 1:
-		/* Mark the PE in fenced state */
-		eeh_serialize_lock(&flags);
-		if (rc == 3)
-			eeh_pe_state_mark(pe,
-				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
-		else
-			eeh_pe_state_mark(pe,
-				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
-		eeh_serialize_unlock(flags);
-
-		/* Purge all events of the PHB */
-		eeh_remove_event(pe);
-		break;
-	default:
-		pr_err("%s: Invalid value %d from next_error()\n",
-		       __func__, rc);
-		return;
-	}
 
-	/*
-	 * For fenced PHB and frozen PE, it's handled as normal
-	 * event. We have to remove the affected PHBs for dead
-	 * PHB and IOC
-	 */
-	if (rc == 2 || rc == 1)
-		eeh_handle_normal_event(pe);
-	else {
-		list_for_each_entry_safe(hose, tmp,
-			&hose_list, list_node) {
-			phb_pe = eeh_phb_pe_get(hose);
-			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
-				continue;
-
-			bus = eeh_pe_bus_get(phb_pe);
-			/* Notify all devices that they're about to go down. */
-			eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
-			pcibios_remove_pci_devices(bus);
+		/*
+		 * For fenced PHB and frozen PE, it's handled as normal
+		 * event. We have to remove the affected PHBs for dead
+		 * PHB and IOC
+		 */
+		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_handle_normal_event(pe);
+		} else {
+			pci_lock_rescan_remove();
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe ||
+				    !(phb_pe->state & EEH_PE_PHB_DEAD))
+					continue;
+
+				/* Notify all devices to be down */
+				bus = eeh_pe_bus_get(phb_pe);
+				eeh_pe_dev_traverse(pe,
+					eeh_report_failure, NULL);
+				pcibios_remove_pci_devices(bus);
+			}
+			pci_unlock_rescan_remove();
 		}
-	}
+
+		/*
+		 * If we have detected dead IOC, we needn't proceed
+		 * any more since all PHBs would have been removed
+		 */
+		if (rc == EEH_NEXT_ERR_DEAD_IOC)
+			break;
+	} while (rc != EEH_NEXT_ERR_NONE);
 }
 
 /**
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index f9450537e335..f0c353fa655a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -25,7 +25,6 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/string.h>
@@ -737,6 +736,9 @@ static void *eeh_restore_one_device_bars(void *data, void *flag)
 	else
 		eeh_restore_device_bars(edev, dn);
 
+	if (eeh_ops->restore_config)
+		eeh_ops->restore_config(dn);
+
 	return NULL;
 }
 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index bbfb0294b354..662c6dd98072 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -664,8 +664,16 @@ _GLOBAL(ret_from_except_lite)
 	bl	.restore_interrupts
 	SCHEDULE_USER
 	b	.ret_from_except_lite
-
-2:	bl	.save_nvgprs
+2:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	andi.	r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+	bne	3f		/* only restore TM if nothing else to do */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.restore_tm_state
+	b	restore
+3:
+#endif
+	bl	.save_nvgprs
 	bl	.restore_interrupts
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_notify_resume
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index e7751561fd1d..063b65dd4f27 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -308,6 +308,7 @@ interrupt_base_book3e:					/* fake trap */
 	EXCEPTION_STUB(0x2e0, guest_doorbell_crit)
 	EXCEPTION_STUB(0x300, hypercall)
 	EXCEPTION_STUB(0x320, ehpriv)
+	EXCEPTION_STUB(0x340, lrat_error)
 
 	.globl interrupt_end_book3e
 interrupt_end_book3e:
@@ -677,6 +678,17 @@ kernel_dbg_exc:
 	bl	.unknown_exception
 	b	.ret_from_except
 
+/* LRAT Error interrupt */
+	START_EXCEPTION(lrat_error);
+	NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x340, PACA_EXGEN, INTS_KEEP)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.save_nvgprs
+	INTS_RESTORE_HARD
+	bl	.unknown_exception
+	b	.ret_from_except
+
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
  * accordingly and if the interrupt is level sensitive, we hard disable
@@ -859,6 +871,7 @@ BAD_STACK_TRAMPOLINE(0x2e0)
 BAD_STACK_TRAMPOLINE(0x300)
 BAD_STACK_TRAMPOLINE(0x310)
 BAD_STACK_TRAMPOLINE(0x320)
+BAD_STACK_TRAMPOLINE(0x340)
 BAD_STACK_TRAMPOLINE(0x400)
 BAD_STACK_TRAMPOLINE(0x500)
 BAD_STACK_TRAMPOLINE(0x600)
@@ -1055,12 +1068,9 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	mtspr	SPRN_MAS0,r3
 	tlbre
 	mfspr	r6,SPRN_MAS1
-	rlwinm	r6,r6,0,2,0	/* clear IPROT */
+	rlwinm	r6,r6,0,2,31	/* clear IPROT and VALID */
 	mtspr	SPRN_MAS1,r6
 	tlbwe
-
-	/* Invalidate TLB1 */
-	PPC_TLBILX_ALL(0,R0)
 	sync
 	isync
 
@@ -1114,12 +1124,9 @@ skpinv:	addi	r6,r6,1				/* Increment */
 	mtspr	SPRN_MAS0,r4
 	tlbre
 	mfspr	r5,SPRN_MAS1
-	rlwinm	r5,r5,0,2,0	/* clear IPROT */
+	rlwinm	r5,r5,0,2,31	/* clear IPROT and VALID */
 	mtspr	SPRN_MAS1,r5
 	tlbwe
-
-	/* Invalidate TLB1 */
-	PPC_TLBILX_ALL(0,R0)
 	sync
 	isync
 
@@ -1414,3 +1421,7 @@ _GLOBAL(setup_ehv_ivors)
 	SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
 	SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
 	blr
+
+_GLOBAL(setup_lrat_ivor)
+	SET_IVOR(42, 0x340) /* LRAT Error */
+	blr
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9f905e40922e..38d507306a11 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -155,8 +155,30 @@ machine_check_pSeries_1:
 	 */
 	HMT_MEDIUM_PPR_DISCARD
 	SET_SCRATCH0(r13)		/* save r13 */
+#ifdef CONFIG_PPC_P7_NAP
+BEGIN_FTR_SECTION
+	/* Running native on arch 2.06 or later, check if we are
+	 * waking up from nap. We only handle no state loss and
+	 * supervisor state loss. We do -not- handle hypervisor
+	 * state loss at this time.
+	 */
+	mfspr	r13,SPRN_SRR1
+	rlwinm.	r13,r13,47-31,30,31
+	beq	9f
+
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
+	/* Total loss of HV state is fatal. let's just stay stuck here */
+	bgt	cr1,.
+9:
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+	b	machine_check_pSeries_early
+FTR_SECTION_ELSE
 	b	machine_check_pSeries_0
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 
 	. = 0x300
 	.globl data_access_pSeries
@@ -405,6 +427,64 @@ denorm_exception_hv:
 
 	.align	7
 	/* moved from 0x200 */
+machine_check_pSeries_early:
+BEGIN_FTR_SECTION
+	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+	/*
+	 * Register contents:
+	 * R13		= PACA
+	 * R9		= CR
+	 * Original R9 to R13 is saved on PACA_EXMC
+	 *
+	 * Switch to mc_emergency stack and handle re-entrancy (though we
+	 * currently don't test for overflow). Save MCE registers srr1,
+	 * srr0, dar and dsisr and then set ME=1
+	 *
+	 * We use paca->in_mce to check whether this is the first entry or
+	 * nested machine check. We increment paca->in_mce to track nested
+	 * machine checks.
+	 *
+	 * If this is the first entry then set stack pointer to
+	 * paca->mc_emergency_sp, otherwise r1 is already pointing to
+	 * stack frame on mc_emergency stack.
+	 *
+	 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
+	 * checkstop if we get another machine check exception before we do
+	 * rfid with MSR_ME=1.
+	 */
+	mr	r11,r1			/* Save r1 */
+	lhz	r10,PACA_IN_MCE(r13)
+	cmpwi	r10,0			/* Are we in nested machine check */
+	bne	0f			/* Yes, we are. */
+	/* First machine check entry */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
+	addi	r10,r10,1		/* increment paca->in_mce */
+	sth	r10,PACA_IN_MCE(r13)
+	std	r11,GPR1(r1)		/* Save r1 on the stack. */
+	std	r11,0(r1)		/* make stack chain pointer */
+	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
+	std	r11,_NIP(r1)
+	mfspr	r11,SPRN_SRR1		/* Save SRR1 */
+	std	r11,_MSR(r1)
+	mfspr	r11,SPRN_DAR		/* Save DAR */
+	std	r11,_DAR(r1)
+	mfspr	r11,SPRN_DSISR		/* Save DSISR */
+	std	r11,_DSISR(r1)
+	std	r9,_CCR(r1)		/* Save CR in stackframe */
+	/* Save r9 through r13 from EXMC save area to stack frame. */
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	mfmsr	r11			/* get MSR value */
+	ori	r11,r11,MSR_ME		/* turn on ME bit */
+	ori	r11,r11,MSR_RI		/* turn on RI bit */
+	ld	r12,PACAKBASE(r13)	/* get high part of &label */
+	LOAD_HANDLER(r12, machine_check_handle_early)
+	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r11
+	rfid
+	b	.	/* prevent speculative execution */
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+
 machine_check_pSeries:
 	.globl machine_check_fwnmi
 machine_check_fwnmi:
@@ -688,30 +768,6 @@ kvmppc_skip_Hinterrupt:
 
 	STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception)
 
-	/*
-	 * Machine check is different because we use a different
-	 * save area: PACA_EXMC instead of PACA_EXGEN.
-	 */
-	.align	7
-	.globl machine_check_common
-machine_check_common:
-
-	mfspr	r10,SPRN_DAR
-	std	r10,PACA_EXGEN+EX_DAR(r13)
-	mfspr	r10,SPRN_DSISR
-	stw	r10,PACA_EXGEN+EX_DSISR(r13)
-	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
-	FINISH_NAP
-	DISABLE_INTS
-	ld	r3,PACA_EXGEN+EX_DAR(r13)
-	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-	bl	.save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.machine_check_exception
-	b	.ret_from_except
-
 	STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ)
 	STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt)
 	STD_EXCEPTION_COMMON(0x980, hdecrementer, .hdec_interrupt)
@@ -1080,6 +1136,30 @@ unrecov_user_slb:
 #endif /* __DISABLED__ */
 
 
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	.align	7
+	.globl machine_check_common
+machine_check_common:
+
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
+	FINISH_NAP
+	DISABLE_INTS
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.machine_check_exception
+	b	.ret_from_except
+
 	.align	7
 	.globl alignment_common
 alignment_common:
@@ -1263,6 +1343,120 @@ _GLOBAL(opal_mc_secondary_handler)
 #endif /* CONFIG_PPC_POWERNV */
 
 
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r0,MSR_RI;				\
+	mfmsr	r9;		/* get MSR value */	\
+	andc	r9,r9,r0;				\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Move original SRR0 and SRR1 into the respective regs */	\
+	ld	r9,_MSR(r1);				\
+	mtspr	SPRN_SRR1,r9;				\
+	ld	r3,_NIP(r1);				\
+	mtspr	SPRN_SRR0,r3;				\
+	ld	r9,_CTR(r1);				\
+	mtctr	r9;					\
+	ld	r9,_XER(r1);				\
+	mtxer	r9;					\
+	ld	r9,_LINK(r1);				\
+	mtlr	r9;					\
+	REST_GPR(0, r1);				\
+	REST_8GPRS(2, r1);				\
+	REST_GPR(10, r1);				\
+	ld	r11,_CCR(r1);				\
+	mtcr	r11;					\
+	/* Decrement paca->in_mce. */			\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	REST_GPR(11, r1);				\
+	REST_2GPRS(12, r1);				\
+	/* restore original r1. */			\
+	ld	r1,GPR1(r1)
+
+	/*
+	 * Handle machine check early in real mode. We come here with
+	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
+	 */
+	.align	7
+	.globl machine_check_handle_early
+machine_check_handle_early:
+	std	r0,GPR0(r1)	/* Save r0 */
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.machine_check_early
+	ld	r12,_MSR(r1)
+#ifdef	CONFIG_PPC_P7_NAP
+	/*
+	 * Check if thread was in power saving mode. We come here when any
+	 * of the following is true:
+	 * a. thread wasn't in power saving mode
+	 * b. thread was in power saving mode with no state loss or
+	 *    supervisor state loss
+	 *
+	 * Go back to nap again if (b) is true.
+	 */
+	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
+	beq	4f			/* No, it wasn;t */
+	/* Thread was in power saving mode. Go back to nap again. */
+	cmpwi	r11,2
+	bne	3f
+	/* Supervisor state loss */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
+3:	bl	.machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	GET_PACA(r13)
+	ld	r1,PACAR1(r13)
+	b	.power7_enter_nap_mode
+4:
+#endif
+	/*
+	 * Check if we are coming from hypervisor userspace. If yes then we
+	 * continue in host kernel in V mode to deliver the MC event.
+	 */
+	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
+	beq	5f
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	9f			/* continue in V mode if we are. */
+
+5:
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	/*
+	 * We are coming from kernel context. Check if we are coming from
+	 * guest. if yes, then we can continue. We will fall through
+	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
+	 */
+	lbz	r11,HSTATE_IN_GUEST(r13)
+	cmpwi	r11,0			/* Check if coming from guest */
+	bne	9f			/* continue if we are. */
+#endif
+	/*
+	 * At this point we are not sure about what context we come from.
+	 * Queue up the MCE event and return from the interrupt.
+	 * But before that, check if this is an un-recoverable exception.
+	 * If yes, then stay on emergency stack and panic.
+	 */
+	andi.	r11,r12,MSR_RI
+	bne	2f
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+2:
+	/*
+	 * Return from MC interrupt.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
+	 */
+	bl	.machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	rfid
+9:
+	/* Deliver the machine check to host kernel in V mode. */
+	MACHINE_CHECK_HANDLER_WINDUP
+	b	machine_check_pSeries
+
 /*
  * r13 points to the PACA, r9 contains the saved CR,
  * r12 contain the saved SRR1, SRR0 is still ready for return
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index f7f5b8bed68f..9ad236e5d2c9 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -81,6 +81,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 /*
+ * Enable use of the FPU, and VSX if possible, for the caller.
+ */
+_GLOBAL(fp_enable)
+	mfmsr	r3
+	ori	r3,r3,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r3,r3,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	SYNC
+	MTMSRD(r3)
+	isync			/* (not necessary for arch 2.02 and later) */
+	blr
+
+/*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.
  */
diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
index a92c79be2728..f22e7e44fbf3 100644
--- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S
+++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
@@ -176,6 +176,8 @@ skpinv:	addi	r6,r6,1				/* Increment */
 /* 7. Jump to KERNELBASE mapping */
 	lis	r6,(KERNELBASE & ~0xfff)@h
 	ori	r6,r6,(KERNELBASE & ~0xfff)@l
+	rlwinm	r7,r25,0,0x03ffffff
+	add	r6,r7,r6
 
 #elif defined(ENTRY_MAPPING_KEXEC_SETUP)
 /*
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 2ae41aba4053..b7363bd42452 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -23,6 +23,7 @@
  */
 
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <asm/reg.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -80,6 +81,7 @@ END_FTR_SECTION(0, 1)
 	 * of the function that the cpu should jump to to continue
 	 * initialization.
 	 */
+	.balign 8
 	.globl  __secondary_hold_spinloop
 __secondary_hold_spinloop:
 	.llong	0x0
@@ -470,6 +472,7 @@ _STATIC(__after_prom_start)
 	mtctr	r8
 	bctr
 
+.balign 8
 p_end:	.llong	_end - _stext
 
 4:	/* Now copy the rest of the kernel up to _end */
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index f45726a1d963..b497188a94a1 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -65,29 +65,78 @@ _ENTRY(_start);
 	nop
 
 	/* Translate device tree address to physical, save in r30/r31 */
-	mfmsr	r16
-	mfspr	r17,SPRN_PID
-	rlwinm	r17,r17,16,0x3fff0000	/* turn PID into MAS6[SPID] */
-	rlwimi	r17,r16,28,0x00000001	/* turn MSR[DS] into MAS6[SAS] */
-	mtspr	SPRN_MAS6,r17
-
-	tlbsx	0,r3			/* must succeed */
-
-	mfspr	r16,SPRN_MAS1
-	mfspr	r20,SPRN_MAS3
-	rlwinm	r17,r16,25,0x1f		/* r17 = log2(page size) */
-	li	r18,1024
-	slw	r18,r18,r17		/* r18 = page size */
-	addi	r18,r18,-1
-	and	r19,r3,r18		/* r19 = page offset */
-	andc	r31,r20,r18		/* r31 = page base */
-	or	r31,r31,r19		/* r31 = devtree phys addr */
-	mfspr	r30,SPRN_MAS7
+	bl	get_phys_addr
+	mr	r30,r3
+	mr	r31,r4
 
 	li	r25,0			/* phys kernel start (low) */
 	li	r24,0			/* CPU number */
 	li	r23,0			/* phys kernel start (high) */
 
+#ifdef CONFIG_RELOCATABLE
+	LOAD_REG_ADDR_PIC(r3, _stext)	/* Get our current runtime base */
+
+	/* Translate _stext address to physical, save in r23/r25 */
+	bl	get_phys_addr
+	mr	r23,r3
+	mr	r25,r4
+
+	bl	0f
+0:	mflr	r8
+	addis	r3,r8,(is_second_reloc - 0b)@ha
+	lwz	r19,(is_second_reloc - 0b)@l(r3)
+
+	/* Check if this is the second relocation. */
+	cmpwi	r19,1
+	bne	1f
+
+	/*
+	 * For the second relocation, we already get the real memstart_addr
+	 * from device tree. So we will map PAGE_OFFSET to memstart_addr,
+	 * then the virtual address of start kernel should be:
+	 *          PAGE_OFFSET + (kernstart_addr - memstart_addr)
+	 * Since the offset between kernstart_addr and memstart_addr should
+	 * never be beyond 1G, so we can just use the lower 32bit of them
+	 * for the calculation.
+	 */
+	lis	r3,PAGE_OFFSET@h
+
+	addis	r4,r8,(kernstart_addr - 0b)@ha
+	addi	r4,r4,(kernstart_addr - 0b)@l
+	lwz	r5,4(r4)
+
+	addis	r6,r8,(memstart_addr - 0b)@ha
+	addi	r6,r6,(memstart_addr - 0b)@l
+	lwz	r7,4(r6)
+
+	subf	r5,r7,r5
+	add	r3,r3,r5
+	b	2f
+
+1:
+	/*
+	 * We have the runtime (virutal) address of our base.
+	 * We calculate our shift of offset from a 64M page.
+	 * We could map the 64M page we belong to at PAGE_OFFSET and
+	 * get going from there.
+	 */
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	rlwinm	r6,r25,0,0x3ffffff		/* r6 = PHYS_START % 64M */
+	rlwinm	r5,r4,0,0x3ffffff		/* r5 = KERNELBASE % 64M */
+	subf	r3,r5,r6			/* r3 = r6 - r5 */
+	add	r3,r4,r3			/* Required Virtual Address */
+
+2:	bl	relocate
+
+	/*
+	 * For the second relocation, we already set the right tlb entries
+	 * for the kernel space, so skip the code in fsl_booke_entry_mapping.S
+	*/
+	cmpwi	r19,1
+	beq	set_ivor
+#endif
+
 /* We try to not make any assumptions about how the boot loader
  * setup or used the TLBs.  We invalidate all mappings from the
  * boot loader and load a single entry in TLB1[0] to map the
@@ -113,6 +162,7 @@ _ENTRY(__early_start)
 #include "fsl_booke_entry_mapping.S"
 #undef ENTRY_MAPPING_BOOT_SETUP
 
+set_ivor:
 	/* Establish the interrupt vector offsets */
 	SET_IVOR(0,  CriticalInput);
 	SET_IVOR(1,  MachineCheck);
@@ -166,8 +216,7 @@ _ENTRY(__early_start)
 	/* Check to see if we're the second processor, and jump
 	 * to the secondary_start code if so
 	 */
-	lis	r24, boot_cpuid@h
-	ori	r24, r24, boot_cpuid@l
+	LOAD_REG_ADDR_PIC(r24, boot_cpuid)
 	lwz	r24, 0(r24)
 	cmpwi	r24, -1
 	mfspr   r24,SPRN_PIR
@@ -197,6 +246,18 @@ _ENTRY(__early_start)
 
 	bl	early_init
 
+#ifdef CONFIG_RELOCATABLE
+	mr	r3,r30
+	mr	r4,r31
+#ifdef CONFIG_PHYS_64BIT
+	mr	r5,r23
+	mr	r6,r25
+#else
+	mr	r5,r25
+#endif
+	bl	relocate_init
+#endif
+
 #ifdef CONFIG_DYNAMIC_MEMSTART
 	lis	r3,kernstart_addr@ha
 	la	r3,kernstart_addr@l(r3)
@@ -856,6 +917,33 @@ KernelSPE:
 #endif /* CONFIG_SPE */
 
 /*
+ * Translate the effec addr in r3 to phys addr. The phys addr will be put
+ * into r3(higher 32bit) and r4(lower 32bit)
+ */
+get_phys_addr:
+	mfmsr	r8
+	mfspr	r9,SPRN_PID
+	rlwinm	r9,r9,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	rlwimi	r9,r8,28,0x00000001	/* turn MSR[DS] into MAS6[SAS] */
+	mtspr	SPRN_MAS6,r9
+
+	tlbsx	0,r3			/* must succeed */
+
+	mfspr	r8,SPRN_MAS1
+	mfspr	r12,SPRN_MAS3
+	rlwinm	r9,r8,25,0x1f		/* r9 = log2(page size) */
+	li	r10,1024
+	slw	r10,r10,r9		/* r10 = page size */
+	addi	r10,r10,-1
+	and	r11,r3,r10		/* r11 = page offset */
+	andc	r4,r12,r10		/* r4 = page base */
+	or	r4,r4,r11		/* r4 = devtree phys addr */
+#ifdef CONFIG_PHYS_64BIT
+	mfspr	r3,SPRN_MAS7
+#endif
+	blr
+
+/*
  * Global functions
  */
 
@@ -1057,24 +1145,36 @@ _GLOBAL(__flush_disable_L1)
 /* When we get here, r24 needs to hold the CPU # */
 	.globl __secondary_start
 __secondary_start:
-	lis	r3,__secondary_hold_acknowledge@h
-	ori	r3,r3,__secondary_hold_acknowledge@l
-	stw	r24,0(r3)
-
-	li	r3,0
-	mr	r4,r24		/* Why? */
-	bl	call_setup_cpu
-
-	lis	r3,tlbcam_index@ha
-	lwz	r3,tlbcam_index@l(r3)
+	LOAD_REG_ADDR_PIC(r3, tlbcam_index)
+	lwz	r3,0(r3)
 	mtctr	r3
 	li	r26,0		/* r26 safe? */
 
+	bl	switch_to_as1
+	mr	r27,r3		/* tlb entry */
 	/* Load each CAM entry */
 1:	mr	r3,r26
 	bl	loadcam_entry
 	addi	r26,r26,1
 	bdnz	1b
+	mr	r3,r27		/* tlb entry */
+	LOAD_REG_ADDR_PIC(r4, memstart_addr)
+	lwz	r4,0(r4)
+	mr	r5,r25		/* phys kernel start */
+	rlwinm	r5,r5,0,~0x3ffffff	/* aligned 64M */
+	subf	r4,r5,r4	/* memstart_addr - phys kernel start */
+	li	r5,0		/* no device tree */
+	li	r6,0		/* not boot cpu */
+	bl	restore_to_as0
+
+
+	lis	r3,__secondary_hold_acknowledge@h
+	ori	r3,r3,__secondary_hold_acknowledge@l
+	stw	r24,0(r3)
+
+	li	r3,0
+	mr	r4,r24		/* Why? */
+	bl	call_setup_cpu
 
 	/* get current_thread_info and current */
 	lis	r1,secondary_ti@ha
@@ -1111,6 +1211,112 @@ __secondary_hold_acknowledge:
 #endif
 
 /*
+ * Create a tlb entry with the same effective and physical address as
+ * the tlb entry used by the current running code. But set the TS to 1.
+ * Then switch to the address space 1. It will return with the r3 set to
+ * the ESEL of the new created tlb.
+ */
+_GLOBAL(switch_to_as1)
+	mflr	r5
+
+	/* Find a entry not used */
+	mfspr	r3,SPRN_TLB1CFG
+	andi.	r3,r3,0xfff
+	mfspr	r4,SPRN_PID
+	rlwinm	r4,r4,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	mtspr	SPRN_MAS6,r4
+1:	lis	r4,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	addi	r3,r3,-1
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbre
+	mfspr	r4,SPRN_MAS1
+	andis.	r4,r4,MAS1_VALID@h
+	bne	1b
+
+	/* Get the tlb entry used by the current running code */
+	bl	0f
+0:	mflr	r4
+	tlbsx	0,r4
+
+	mfspr	r4,SPRN_MAS1
+	ori	r4,r4,MAS1_TS		/* Set the TS = 1 */
+	mtspr	SPRN_MAS1,r4
+
+	mfspr	r4,SPRN_MAS0
+	rlwinm	r4,r4,0,~MAS0_ESEL_MASK
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbwe
+	isync
+	sync
+
+	mfmsr	r4
+	ori	r4,r4,MSR_IS | MSR_DS
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r4
+	sync
+	rfi
+
+/*
+ * Restore to the address space 0 and also invalidate the tlb entry created
+ * by switch_to_as1.
+ * r3 - the tlb entry which should be invalidated
+ * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0)
+ * r5 - device tree virtual address. If r4 is 0, r5 is ignored.
+ * r6 - boot cpu
+*/
+_GLOBAL(restore_to_as0)
+	mflr	r0
+
+	bl	0f
+0:	mflr	r9
+	addi	r9,r9,1f - 0b
+
+	/*
+	 * We may map the PAGE_OFFSET in AS0 to a different physical address,
+	 * so we need calculate the right jump and device tree address based
+	 * on the offset passed by r4.
+	 */
+	add	r9,r9,r4
+	add	r5,r5,r4
+	add	r0,r0,r4
+
+2:	mfmsr	r7
+	li	r8,(MSR_IS | MSR_DS)
+	andc	r7,r7,r8
+
+	mtspr	SPRN_SRR0,r9
+	mtspr	SPRN_SRR1,r7
+	sync
+	rfi
+
+	/* Invalidate the temporary tlb entry for AS1 */
+1:	lis	r9,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r9,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r9
+	tlbre
+	mfspr	r9,SPRN_MAS1
+	rlwinm	r9,r9,0,2,31		/* Clear MAS1 Valid and IPPROT */
+	mtspr	SPRN_MAS1,r9
+	tlbwe
+	isync
+
+	cmpwi	r4,0
+	cmpwi	cr1,r6,0
+	cror	eq,4*cr1+eq,eq
+	bne	3f			/* offset != 0 && is_boot_cpu */
+	mtlr	r0
+	blr
+
+	/*
+	 * The PAGE_OFFSET will map to a different physical address,
+	 * jump to _start to do another relocation again.
+	*/
+3:	mr	r3,r5
+	bl	_start
+
+/*
  * We put a few things here that have to be page-aligned. This stuff
  * goes at the beginning of the data segment, which is page-aligned.
  */
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index f0b47d1a6b0e..b0a1792279bb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -28,7 +28,6 @@
 #include <linux/percpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/hw_breakpoint.h>
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 847e40e62fce..3fdef0f0c67f 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -84,6 +84,7 @@ _GLOBAL(power7_nap)
 	std	r9,_MSR(r1)
 	std	r1,PACAR1(r13)
 
+_GLOBAL(power7_enter_nap_mode)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	/* Tell KVM we're napping */
 	li	r4,KVM_HWTHREAD_IN_NAP
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 97a3715ac8bd..b82227e7e21b 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -3,7 +3,6 @@
  *
  * (C) Copyright 2004 Linus Torvalds
  */
-#include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/mm.h>
 #include <linux/export.h>
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 572bb5b95f35..88e3ec6e1d96 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -251,14 +251,13 @@ again:
 
 	if (dev)
 		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-				      1 << IOMMU_PAGE_SHIFT);
+				      1 << tbl->it_page_shift);
 	else
-		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
+		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
-	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
-			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
-			     align_mask);
+	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
+			     boundary_size >> tbl->it_page_shift, align_mask);
 	if (n == -1) {
 		if (likely(pass == 0)) {
 			/* First try the pool from the start */
@@ -320,12 +319,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 		return DMA_ERROR_CODE;
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
-	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
+	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
 	build_fail = ppc_md.tce_build(tbl, entry, npages,
-	                              (unsigned long)page & IOMMU_PAGE_MASK,
-	                              direction, attrs);
+				      (unsigned long)page &
+				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
 	/* ppc_md.tce_build() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
@@ -352,7 +351,7 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 {
 	unsigned long entry, free_entry;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	entry = dma_addr >> tbl->it_page_shift;
 	free_entry = entry - tbl->it_offset;
 
 	if (((free_entry + npages) > tbl->it_size) ||
@@ -401,7 +400,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned long flags;
 	struct iommu_pool *pool;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	entry = dma_addr >> tbl->it_page_shift;
 	free_entry = entry - tbl->it_offset;
 
 	pool = get_pool(tbl, free_entry);
@@ -468,13 +467,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		}
 		/* Allocate iommu entries for that segment */
 		vaddr = (unsigned long) sg_virt(s);
-		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
 		    (vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
-					  mask >> IOMMU_PAGE_SHIFT, align);
+					  mask >> tbl->it_page_shift, align);
 
 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
 
@@ -489,16 +488,16 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 		/* Convert entry to a dma_addr_t */
 		entry += tbl->it_offset;
-		dma_addr = entry << IOMMU_PAGE_SHIFT;
-		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
+		dma_addr = entry << tbl->it_page_shift;
+		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
 
 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
 		build_fail = ppc_md.tce_build(tbl, entry, npages,
-		                              vaddr & IOMMU_PAGE_MASK,
-		                              direction, attrs);
+					      vaddr & IOMMU_PAGE_MASK(tbl),
+					      direction, attrs);
 		if(unlikely(build_fail))
 			goto failure;
 
@@ -559,9 +558,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s->dma_length != 0) {
 			unsigned long vaddr, npages;
 
-			vaddr = s->dma_address & IOMMU_PAGE_MASK;
+			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
-						 IOMMU_PAGE_SIZE);
+						 IOMMU_PAGE_SIZE(tbl));
 			__iommu_free(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
@@ -592,7 +591,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		if (sg->dma_length == 0)
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
-					 IOMMU_PAGE_SIZE);
+					 IOMMU_PAGE_SIZE(tbl));
 		__iommu_free(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
@@ -676,7 +675,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 		set_bit(0, tbl->it_map);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
-	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
 		tbl->nr_pools = IOMMU_NR_POOLS;
 	else
 		tbl->nr_pools = 1;
@@ -768,16 +767,16 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 
 	vaddr = page_address(page) + offset;
 	uaddr = (unsigned long)vaddr;
-	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE);
+	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
 
 	if (tbl) {
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 
 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
-					 mask >> IOMMU_PAGE_SHIFT, align,
+					 mask >> tbl->it_page_shift, align,
 					 attrs);
 		if (dma_handle == DMA_ERROR_CODE) {
 			if (printk_ratelimit())  {
@@ -786,7 +785,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 					 npages);
 			}
 		} else
-			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
+			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
 	}
 
 	return dma_handle;
@@ -801,7 +800,8 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
 	BUG_ON(direction == DMA_NONE);
 
 	if (tbl) {
-		npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(dma_handle, size,
+					 IOMMU_PAGE_SIZE(tbl));
 		iommu_free(tbl, dma_handle, npages);
 	}
 }
@@ -845,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	nio_pages = size >> IOMMU_PAGE_SHIFT;
-	io_order = get_iommu_order(size);
+	nio_pages = size >> tbl->it_page_shift;
+	io_order = get_iommu_order(size, tbl);
 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
-			      mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
+			      mask >> tbl->it_page_shift, io_order, NULL);
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
 		return NULL;
@@ -864,7 +864,7 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		unsigned int nio_pages;
 
 		size = PAGE_ALIGN(size);
-		nio_pages = size >> IOMMU_PAGE_SHIFT;
+		nio_pages = size >> tbl->it_page_shift;
 		iommu_free(tbl, dma_handle, nio_pages);
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
@@ -935,10 +935,10 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 	if (tce_value)
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK)
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT;
+	ioba >>= tbl->it_page_shift;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -955,13 +955,13 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
 		return -EINVAL;
 
-	if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK)
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT;
+	ioba >>= tbl->it_page_shift;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -1037,7 +1037,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
-				__func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
+			__func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl),
 				hwaddr, ret); */
 
 	return ret;
@@ -1049,14 +1049,14 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 {
 	int ret;
 	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 	enum dma_data_direction direction = iommu_tce_direction(tce);
 
 	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
 			direction != DMA_TO_DEVICE, &page);
 	if (unlikely(ret != 1)) {
 		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << IOMMU_PAGE_SHIFT, ret); */
+				tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */
 		return -EFAULT;
 	}
 	hwaddr = (unsigned long) page_address(page) + offset;
@@ -1067,7 +1067,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 
 	if (ret < 0)
 		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-				__func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
+			__func__, entry << tbl->it_page_shift, tce, ret);
 
 	return ret;
 }
@@ -1088,6 +1088,14 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	memset(tbl->it_map, 0xff, sz);
 	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
+	/*
+	 * Disable iommu bypass, otherwise the user can DMA to all of
+	 * our physical memory via the bypass window instead of just
+	 * the pages that has been explicitly mapped into the iommu
+	 */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, false);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1102,10 +1110,14 @@ void iommu_release_ownership(struct iommu_table *tbl)
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
+
+	/* The kernel owns the device now, we can restore the iommu bypass */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-static int iommu_add_device(struct device *dev)
+int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
 	int ret = 0;
@@ -1127,6 +1139,12 @@ static int iommu_add_device(struct device *dev)
 	pr_debug("iommu_tce: adding %s to iommu group %d\n",
 			dev_name(dev), iommu_group_id(tbl->it_group));
 
+	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
+		pr_err("iommu_tce: unsupported iommu page size.");
+		pr_err("%s has not been added\n", dev_name(dev));
+		return -EINVAL;
+	}
+
 	ret = iommu_group_add_device(tbl->it_group, dev);
 	if (ret < 0)
 		pr_err("iommu_tce: %s has not been added, ret=%d\n",
@@ -1134,52 +1152,23 @@ static int iommu_add_device(struct device *dev)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(iommu_add_device);
 
-static void iommu_del_device(struct device *dev)
-{
-	iommu_group_remove_device(dev);
-}
-
-static int iommu_bus_notifier(struct notifier_block *nb,
-			      unsigned long action, void *data)
+void iommu_del_device(struct device *dev)
 {
-	struct device *dev = data;
-
-	switch (action) {
-	case BUS_NOTIFY_ADD_DEVICE:
-		return iommu_add_device(dev);
-	case BUS_NOTIFY_DEL_DEVICE:
-		iommu_del_device(dev);
-		return 0;
-	default:
-		return 0;
+	/*
+	 * Some devices might not have IOMMU table and group
+	 * and we needn't detach them from the associated
+	 * IOMMU groups
+	 */
+	if (!dev->iommu_group) {
+		pr_debug("iommu_tce: skipping device %s with no tbl\n",
+			 dev_name(dev));
+		return;
 	}
-}
-
-static struct notifier_block tce_iommu_bus_nb = {
-	.notifier_call = iommu_bus_notifier,
-};
-
-static int __init tce_iommu_init(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
-
-	for_each_pci_dev(pdev)
-		iommu_add_device(&pdev->dev);
-
-	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
-	return 0;
-}
-
-subsys_initcall_sync(tce_iommu_init);
-
-#else
 
-void iommu_register_group(struct iommu_table *tbl,
-		int pci_domain_number, unsigned long pe_num)
-{
+	iommu_group_remove_device(dev);
 }
+EXPORT_SYMBOL_GPL(iommu_del_device);
 
 #endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ba0165615215..1d0848bba049 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -354,8 +354,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 
 	seq_printf(p, "%*s: ", prec, "LOC");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs);
-        seq_printf(p, "  Local timer interrupts\n");
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event);
+        seq_printf(p, "  Local timer interrupts for timer event device\n");
+
+	seq_printf(p, "%*s: ", prec, "LOC");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others);
+        seq_printf(p, "  Local timer interrupts for others\n");
 
 	seq_printf(p, "%*s: ", prec, "SPU");
 	for_each_online_cpu(j)
@@ -389,11 +394,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
  */
 u64 arch_irq_stat_cpu(unsigned int cpu)
 {
-	u64 sum = per_cpu(irq_stat, cpu).timer_irqs;
+	u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event;
 
 	sum += per_cpu(irq_stat, cpu).pmu_irqs;
 	sum += per_cpu(irq_stat, cpu).mce_exceptions;
 	sum += per_cpu(irq_stat, cpu).spurious_irqs;
+	sum += per_cpu(irq_stat, cpu).timer_irqs_others;
 #ifdef CONFIG_PPC_DOORBELL
 	sum += per_cpu(irq_stat, cpu).doorbell_irqs;
 #endif
@@ -553,8 +559,13 @@ void exc_lvl_ctx_init(void)
 #ifdef CONFIG_PPC64
 		cpu_nr = i;
 #else
+#ifdef CONFIG_SMP
 		cpu_nr = get_hard_smp_processor_id(i);
+#else
+		cpu_nr = 0;
+#endif
 #endif
+
 		memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
 		tp = critirq_ctx[cpu_nr];
 		tp->cpu = cpu_nr;
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 83e89d310734..8504657379f1 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index db28032e320e..6a0175297b0d 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -413,13 +413,13 @@ static void kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
 
-	ulong in[8];
+	ulong in[8] = {0};
 	ulong out[8];
 
 	in[0] = KVM_MAGIC_PAGE;
 	in[1] = KVM_MAGIC_PAGE;
 
-	kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
+	epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
 	*features = out[0];
 }
@@ -711,43 +711,6 @@ static void kvm_use_magic_page(void)
 			 kvm_patching_worked ? "worked" : "failed");
 }
 
-unsigned long kvm_hypercall(unsigned long *in,
-			    unsigned long *out,
-			    unsigned long nr)
-{
-	unsigned long register r0 asm("r0");
-	unsigned long register r3 asm("r3") = in[0];
-	unsigned long register r4 asm("r4") = in[1];
-	unsigned long register r5 asm("r5") = in[2];
-	unsigned long register r6 asm("r6") = in[3];
-	unsigned long register r7 asm("r7") = in[4];
-	unsigned long register r8 asm("r8") = in[5];
-	unsigned long register r9 asm("r9") = in[6];
-	unsigned long register r10 asm("r10") = in[7];
-	unsigned long register r11 asm("r11") = nr;
-	unsigned long register r12 asm("r12");
-
-	asm volatile("bl	epapr_hypercall_start"
-		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
-		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
-		       "=r"(r12)
-		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
-		       "r"(r9), "r"(r10), "r"(r11)
-		     : "memory", "cc", "xer", "ctr", "lr");
-
-	out[0] = r4;
-	out[1] = r5;
-	out[2] = r6;
-	out[3] = r7;
-	out[4] = r8;
-	out[5] = r9;
-	out[6] = r10;
-	out[7] = r11;
-
-	return r3;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
 static __init void kvm_free_tmp(void)
 {
 	free_reserved_area(&kvm_tmp[kvm_tmp_index],
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 75d4f7340da8..015ae55c1868 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -196,7 +196,9 @@ int overlaps_crashkernel(unsigned long start, unsigned long size)
 
 /* Values we need to export to the second kernel via the device tree. */
 static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
 static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
 
 static struct property kernel_end_prop = {
 	.name = "linux,kernel-end",
@@ -207,7 +209,7 @@ static struct property kernel_end_prop = {
 static struct property crashk_base_prop = {
 	.name = "linux,crashkernel-base",
 	.length = sizeof(phys_addr_t),
-	.value = &crashk_res.start,
+	.value = &crashk_base
 };
 
 static struct property crashk_size_prop = {
@@ -219,9 +221,11 @@ static struct property crashk_size_prop = {
 static struct property memory_limit_prop = {
 	.name = "linux,memory-limit",
 	.length = sizeof(unsigned long long),
-	.value = &memory_limit,
+	.value = &mem_limit,
 };
 
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+
 static void __init export_crashk_values(struct device_node *node)
 {
 	struct property *prop;
@@ -237,8 +241,9 @@ static void __init export_crashk_values(struct device_node *node)
 		of_remove_property(node, prop);
 
 	if (crashk_res.start != 0) {
+		crashk_base = cpu_to_be_ulong(crashk_res.start),
 		of_add_property(node, &crashk_base_prop);
-		crashk_size = resource_size(&crashk_res);
+		crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
 		of_add_property(node, &crashk_size_prop);
 	}
 
@@ -246,6 +251,7 @@ static void __init export_crashk_values(struct device_node *node)
 	 * memory_limit is required by the kexec-tools to limit the
 	 * crash regions to the actual memory used.
 	 */
+	mem_limit = cpu_to_be_ulong(memory_limit);
 	of_update_property(node, &memory_limit_prop);
 }
 
@@ -264,7 +270,7 @@ static int __init kexec_setup(void)
 		of_remove_property(node, prop);
 
 	/* information needed by userspace when using default_machine_kexec */
-	kernel_end = __pa(_end);
+	kernel_end = cpu_to_be_ulong(__pa(_end));
 	of_add_property(node, &kernel_end_prop);
 
 	export_crashk_values(node);
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index be4e6d648f60..59d229a2a3e0 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -369,6 +369,7 @@ void default_machine_kexec(struct kimage *image)
 
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
+static unsigned long htab_size;
 
 static struct property htab_base_prop = {
 	.name = "linux,htab-base",
@@ -379,7 +380,7 @@ static struct property htab_base_prop = {
 static struct property htab_size_prop = {
 	.name = "linux,htab-size",
 	.length = sizeof(unsigned long),
-	.value = &htab_size_bytes,
+	.value = &htab_size,
 };
 
 static int __init export_htab_values(void)
@@ -403,8 +404,9 @@ static int __init export_htab_values(void)
 	if (prop)
 		of_remove_property(node, prop);
 
-	htab_base = __pa(htab_address);
+	htab_base = cpu_to_be64(__pa(htab_address));
 	of_add_property(node, &htab_base_prop);
+	htab_size = cpu_to_be64(htab_size_bytes);
 	of_add_property(node, &htab_size_prop);
 
 	of_node_put(node);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
new file mode 100644
index 000000000000..cadef7e64e42
--- /dev/null
+++ b/arch/powerpc/kernel/mce.c
@@ -0,0 +1,352 @@
+/*
+ * Machine check exception handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/irq_work.h>
+#include <asm/mce.h>
+
+static DEFINE_PER_CPU(int, mce_nest_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
+
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
+static void machine_check_process_queued_event(struct irq_work *work);
+struct irq_work mce_event_process_work = {
+        .func = machine_check_process_queued_event,
+};
+
+static void mce_set_error_info(struct machine_check_event *mce,
+			       struct mce_error_info *mce_err)
+{
+	mce->error_type = mce_err->error_type;
+	switch (mce_err->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
+		break;
+	case MCE_ERROR_TYPE_UNKNOWN:
+	default:
+		break;
+	}
+}
+
+/*
+ * Decode and save high level MCE information into per cpu buffer which
+ * is an array of machine_check_event structure.
+ */
+void save_mce_event(struct pt_regs *regs, long handled,
+		    struct mce_error_info *mce_err,
+		    uint64_t addr)
+{
+	uint64_t srr1;
+	int index = __get_cpu_var(mce_nest_count)++;
+	struct machine_check_event *mce = &__get_cpu_var(mce_event[index]);
+
+	/*
+	 * Return if we don't have enough space to log mce event.
+	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
+	 * the check below will stop buffer overrun.
+	 */
+	if (index >= MAX_MC_EVT)
+		return;
+
+	/* Populate generic machine check info */
+	mce->version = MCE_V1;
+	mce->srr0 = regs->nip;
+	mce->srr1 = regs->msr;
+	mce->gpr3 = regs->gpr[3];
+	mce->in_use = 1;
+
+	mce->initiator = MCE_INITIATOR_CPU;
+	if (handled)
+		mce->disposition = MCE_DISPOSITION_RECOVERED;
+	else
+		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
+	mce->severity = MCE_SEV_ERROR_SYNC;
+
+	srr1 = regs->msr;
+
+	/*
+	 * Populate the mce error_type and type-specific error_type.
+	 */
+	mce_set_error_info(mce, mce_err);
+
+	if (!addr)
+		return;
+
+	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
+		mce->u.tlb_error.effective_address_provided = true;
+		mce->u.tlb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
+		mce->u.slb_error.effective_address_provided = true;
+		mce->u.slb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
+		mce->u.erat_error.effective_address_provided = true;
+		mce->u.erat_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+		mce->u.ue_error.effective_address_provided = true;
+		mce->u.ue_error.effective_address = addr;
+	}
+	return;
+}
+
+/*
+ * get_mce_event:
+ *	mce	Pointer to machine_check_event structure to be filled.
+ *	release Flag to indicate whether to free the event slot or not.
+ *		0 <= do not release the mce event. Caller will invoke
+ *		     release_mce_event() once event has been consumed.
+ *		1 <= release the slot.
+ *
+ *	return	1 = success
+ *		0 = failure
+ *
+ * get_mce_event() will be called by platform specific machine check
+ * handle routine and in KVM.
+ * When we call get_mce_event(), we are still in interrupt context and
+ * preemption will not be scheduled until ret_from_expect() routine
+ * is called.
+ */
+int get_mce_event(struct machine_check_event *mce, bool release)
+{
+	int index = __get_cpu_var(mce_nest_count) - 1;
+	struct machine_check_event *mc_evt;
+	int ret = 0;
+
+	/* Sanity check */
+	if (index < 0)
+		return ret;
+
+	/* Check if we have MCE info to process. */
+	if (index < MAX_MC_EVT) {
+		mc_evt = &__get_cpu_var(mce_event[index]);
+		/* Copy the event structure and release the original */
+		if (mce)
+			*mce = *mc_evt;
+		if (release)
+			mc_evt->in_use = 0;
+		ret = 1;
+	}
+	/* Decrement the count to free the slot. */
+	if (release)
+		__get_cpu_var(mce_nest_count)--;
+
+	return ret;
+}
+
+void release_mce_event(void)
+{
+	get_mce_event(NULL, true);
+}
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+	int index;
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return;
+
+	index = __get_cpu_var(mce_queue_count)++;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__get_cpu_var(mce_queue_count)--;
+		return;
+	}
+	__get_cpu_var(mce_event_queue[index]) = evt;
+
+	/* Queue irq work to process this event later. */
+	irq_work_queue(&mce_event_process_work);
+}
+
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_check_process_queued_event(struct irq_work *work)
+{
+	int index;
+
+	/*
+	 * For now just print it to console.
+	 * TODO: log this error event to FSP or nvram.
+	 */
+	while (__get_cpu_var(mce_queue_count) > 0) {
+		index = __get_cpu_var(mce_queue_count) - 1;
+		machine_check_print_event_info(
+				&__get_cpu_var(mce_event_queue[index]));
+		__get_cpu_var(mce_queue_count)--;
+	}
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt)
+{
+	const char *level, *sevstr, *subtype;
+	static const char *mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+	static const char *mc_slb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_erat_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_tlb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+
+	/* Print things out */
+	if (evt->version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt->version);
+		return;
+	}
+	switch (evt->severity) {
+	case MCE_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case MCE_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case MCE_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case MCE_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
+	       "Recovered" : "[Not recovered");
+	printk("%s  Initiator: %s\n", level,
+	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		subtype = evt->u.ue_error.ue_error_type <
+			ARRAY_SIZE(mc_ue_types) ?
+			mc_ue_types[evt->u.ue_error.ue_error_type]
+			: "Unknown";
+		printk("%s  Error type: UE [%s]\n", level, subtype);
+		if (evt->u.ue_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.ue_error.effective_address);
+		if (evt->u.ue_error.physical_address_provided)
+			printk("%s      Physial address: %016llx\n",
+			       level, evt->u.ue_error.physical_address);
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		subtype = evt->u.slb_error.slb_error_type <
+			ARRAY_SIZE(mc_slb_types) ?
+			mc_slb_types[evt->u.slb_error.slb_error_type]
+			: "Unknown";
+		printk("%s  Error type: SLB [%s]\n", level, subtype);
+		if (evt->u.slb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.slb_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		subtype = evt->u.erat_error.erat_error_type <
+			ARRAY_SIZE(mc_erat_types) ?
+			mc_erat_types[evt->u.erat_error.erat_error_type]
+			: "Unknown";
+		printk("%s  Error type: ERAT [%s]\n", level, subtype);
+		if (evt->u.erat_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.erat_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		subtype = evt->u.tlb_error.tlb_error_type <
+			ARRAY_SIZE(mc_tlb_types) ?
+			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
+			: "Unknown";
+		printk("%s  Error type: TLB [%s]\n", level, subtype);
+		if (evt->u.tlb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.tlb_error.effective_address);
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		printk("%s  Error type: Unknown\n", level);
+		break;
+	}
+}
+
+uint64_t get_mce_fault_addr(struct machine_check_event *evt)
+{
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		if (evt->u.ue_error.effective_address_provided)
+			return evt->u.ue_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		if (evt->u.slb_error.effective_address_provided)
+			return evt->u.slb_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		if (evt->u.erat_error.effective_address_provided)
+			return evt->u.erat_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		if (evt->u.tlb_error.effective_address_provided)
+			return evt->u.tlb_error.effective_address;
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
new file mode 100644
index 000000000000..27c93f41166f
--- /dev/null
+++ b/arch/powerpc/kernel/mce_power.c
@@ -0,0 +1,284 @@
+/*
+ * Machine check exception handling CPU-side for power7 and power8
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+
+/* flush SLBs and reload */
+static void flush_and_reload_slb(void)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* Invalidate all SLBs */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+	/*
+	 * If machine check is hit when in guest or in transition, we will
+	 * only flush the SLBs and continue.
+	 */
+	if (get_paca()->kvm_hstate.in_guest)
+		return;
+#endif
+
+	/* For host kernel, reload the SLBs from shadow SLB buffer. */
+	slb = get_slb_shadow();
+	if (!slb)
+		return;
+
+	n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+
+	/* Load up the SLB entries from shadow SLB */
+	for (i = 0; i < n; i++) {
+		unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+		unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+		rb = (rb & ~0xFFFul) | i;
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+{
+	long handled = 1;
+
+	/*
+	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
+	 * reset the error bits whenever we handle them so that at the end
+	 * we can check whether we handled all of them or not.
+	 * */
+	if (dsisr & slb_error_bits) {
+		flush_and_reload_slb();
+		/* reset error bits */
+		dsisr &= ~(slb_error_bits);
+	}
+	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+		/* reset error bits */
+		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+	}
+	/* Any other errors we don't understand? */
+	if (dsisr & 0xffffffffUL)
+		handled = 0;
+
+	return handled;
+}
+
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+}
+
+static long mce_handle_common_ierror(uint64_t srr1)
+{
+	long handled = 0;
+
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case 0:
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		/* flush and reload SLBs for SLB errors. */
+		flush_and_reload_slb();
+		handled = 1;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+			handled = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return handled;
+}
+
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_UE:
+	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
+		break;
+	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+		break;
+	}
+}
+
+static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
+static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	if (dsisr & P7_DSISR_MC_UE) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+	uint64_t srr1, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+
+	/*
+	 * Handle memory errors depending whether this was a load/store or
+	 * ifetch exception. Also, populate the mce error_type and
+	 * type-specific error_type from either SRR1 or DSISR, depending
+	 * whether this was a load/store or ifetch exception
+	 */
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p7(regs->dsisr);
+		mce_get_derror_p7(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p7(srr1);
+		mce_get_ierror_p7(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	save_mce_event(regs, handled, &mce_error_info, addr);
+	return handled;
+}
+
+static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
+static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	mce_get_derror_p7(mce_err, dsisr);
+	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
+static long mce_handle_ierror_p8(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+static long mce_handle_derror_p8(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+}
+
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+	uint64_t srr1, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p8(regs->dsisr);
+		mce_get_derror_p8(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p8(srr1);
+		mce_get_ierror_p8(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	save_mce_event(regs, handled, &mce_error_info, addr);
+	return handled;
+}
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index e47d268727a4..879f09620f83 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -344,7 +344,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
  */
 _KPROBE(flush_icache_range)
 BEGIN_FTR_SECTION
-	isync
+	PURGE_PREFETCHED_INS
 	blr				/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	li	r5,L1_CACHE_BYTES-1
@@ -448,6 +448,7 @@ _GLOBAL(invalidate_dcache_range)
  */
 _GLOBAL(__flush_dcache_icache)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
@@ -489,6 +490,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
  */
 _GLOBAL(__flush_dcache_icache_phys)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr					/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	mfmsr	r10
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 64bf8db12b15..3d0249599d52 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -67,6 +67,7 @@ PPC64_CACHES:
 
 _KPROBE(flush_icache_range)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 /*
@@ -211,6 +212,11 @@ _GLOBAL(__flush_dcache_icache)
  * Different systems have different cache line sizes
  */
 
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+
 /* Flush the dcache */
  	ld	r7,PPC64_CACHES@toc(r2)
 	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 0620eaaaad45..bf0aada02fe4 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -99,12 +99,28 @@ static inline void free_lppacas(void) { }
  * 3 persistent SLBs are registered here.  The buffer will be zero
  * initially, hence will all be invaild until we actually write them.
  */
-struct slb_shadow slb_shadow[] __cacheline_aligned = {
-	[0 ... (NR_CPUS-1)] = {
-		.persistent = cpu_to_be32(SLB_NUM_BOLTED),
-		.buffer_length = cpu_to_be32(sizeof(struct slb_shadow)),
-	},
-};
+static struct slb_shadow *slb_shadow;
+
+static void __init allocate_slb_shadows(int nr_cpus, int limit)
+{
+	int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus);
+	slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit));
+	memset(slb_shadow, 0, size);
+}
+
+static struct slb_shadow * __init init_slb_shadow(int cpu)
+{
+	struct slb_shadow *s = &slb_shadow[cpu];
+
+	s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
+	s->buffer_length = cpu_to_be32(sizeof(*s));
+
+	return s;
+}
+
+#else /* CONFIG_PPC_STD_MMU_64 */
+
+static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
 
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
@@ -142,8 +158,13 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 	new_paca->__current = &init_task;
 	new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
 #ifdef CONFIG_PPC_STD_MMU_64
-	new_paca->slb_shadow_ptr = &slb_shadow[cpu];
+	new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
 #endif /* CONFIG_PPC_STD_MMU_64 */
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* For now -- if we have threads this will be adjusted later */
+	new_paca->tcd_ptr = &new_paca->tcd;
+#endif
 }
 
 /* Put the paca pointer into r13 and SPRG_PACA */
@@ -190,6 +211,8 @@ void __init allocate_pacas(void)
 
 	allocate_lppacas(nr_cpu_ids, limit);
 
+	allocate_slb_shadows(nr_cpu_ids, limit);
+
 	/* Can't use for_each_*_cpu, as they aren't functional yet */
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		initialise_paca(&paca[cpu], cpu);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index a1e3e40ca3fd..d9476c1fc959 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -835,7 +835,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
 		 * at 0 as unset as well, except if PCI_PROBE_ONLY is also set
 		 * since in that case, we don't want to re-assign anything
 		 */
-		pcibios_resource_to_bus(dev, &reg, res);
+		pcibios_resource_to_bus(dev->bus, &reg, res);
 		if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) ||
 		    (reg.start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) {
 			/* Only print message if not re-assigning */
@@ -886,7 +886,7 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
 
 	/* Job is a bit different between memory and IO */
 	if (res->flags & IORESOURCE_MEM) {
-		pcibios_resource_to_bus(dev, &region, res);
+		pcibios_resource_to_bus(dev->bus, &region, res);
 
 		/* If the BAR is non-0 then it's probably been initialized */
 		if (region.start != 0)
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index ac0b034f9ae0..83c26d829991 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -111,7 +111,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
 		res->name = pci_name(dev);
 		region.start = base;
 		region.end = base + size - 1;
-		pcibios_bus_to_resource(dev, res, &region);
+		pcibios_bus_to_resource(dev->bus, res, &region);
 	}
 }
 
@@ -280,7 +280,7 @@ void of_scan_pci_bridge(struct pci_dev *dev)
 		res->flags = flags;
 		region.start = of_read_number(&ranges[1], 2);
 		region.end = region.start + size - 1;
-		pcibios_bus_to_resource(dev, res, &region);
+		pcibios_bus_to_resource(dev->bus, res, &region);
 	}
 	sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus),
 		bus->number);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3386d8ab7eb0..8d4c247f1738 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/elf.h>
-#include <linux/init.h>
 #include <linux/prctl.h>
 #include <linux/init_task.h>
 #include <linux/export.h>
@@ -74,6 +73,48 @@ struct task_struct *last_task_used_vsx = NULL;
 struct task_struct *last_task_used_spe = NULL;
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void giveup_fpu_maybe_transactional(struct task_struct *tsk)
+{
+	/*
+	 * If we are saving the current thread's registers, and the
+	 * thread is in a transactional state, set the TIF_RESTORE_TM
+	 * bit so that we know to restore the registers before
+	 * returning to userspace.
+	 */
+	if (tsk == current && tsk->thread.regs &&
+	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+	    !test_thread_flag(TIF_RESTORE_TM)) {
+		tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+		set_thread_flag(TIF_RESTORE_TM);
+	}
+
+	giveup_fpu(tsk);
+}
+
+void giveup_altivec_maybe_transactional(struct task_struct *tsk)
+{
+	/*
+	 * If we are saving the current thread's registers, and the
+	 * thread is in a transactional state, set the TIF_RESTORE_TM
+	 * bit so that we know to restore the registers before
+	 * returning to userspace.
+	 */
+	if (tsk == current && tsk->thread.regs &&
+	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+	    !test_thread_flag(TIF_RESTORE_TM)) {
+		tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+		set_thread_flag(TIF_RESTORE_TM);
+	}
+
+	giveup_altivec(tsk);
+}
+
+#else
+#define giveup_fpu_maybe_transactional(tsk)	giveup_fpu(tsk)
+#define giveup_altivec_maybe_transactional(tsk)	giveup_altivec(tsk)
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
 #ifdef CONFIG_PPC_FPU
 /*
  * Make sure the floating-point register state in the
@@ -102,13 +143,13 @@ void flush_fp_to_thread(struct task_struct *tsk)
 			 */
 			BUG_ON(tsk != current);
 #endif
-			giveup_fpu(tsk);
+			giveup_fpu_maybe_transactional(tsk);
 		}
 		preempt_enable();
 	}
 }
 EXPORT_SYMBOL_GPL(flush_fp_to_thread);
-#endif
+#endif /* CONFIG_PPC_FPU */
 
 void enable_kernel_fp(void)
 {
@@ -116,11 +157,11 @@ void enable_kernel_fp(void)
 
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
-		giveup_fpu(current);
+		giveup_fpu_maybe_transactional(current);
 	else
 		giveup_fpu(NULL);	/* just enables FP for kernel */
 #else
-	giveup_fpu(last_task_used_math);
+	giveup_fpu_maybe_transactional(last_task_used_math);
 #endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_fp);
@@ -132,11 +173,11 @@ void enable_kernel_altivec(void)
 
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
-		giveup_altivec(current);
+		giveup_altivec_maybe_transactional(current);
 	else
 		giveup_altivec_notask();
 #else
-	giveup_altivec(last_task_used_altivec);
+	giveup_altivec_maybe_transactional(last_task_used_altivec);
 #endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
@@ -153,7 +194,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 #ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
 #endif
-			giveup_altivec(tsk);
+			giveup_altivec_maybe_transactional(tsk);
 		}
 		preempt_enable();
 	}
@@ -182,8 +223,8 @@ EXPORT_SYMBOL(enable_kernel_vsx);
 
 void giveup_vsx(struct task_struct *tsk)
 {
-	giveup_fpu(tsk);
-	giveup_altivec(tsk);
+	giveup_fpu_maybe_transactional(tsk);
+	giveup_altivec_maybe_transactional(tsk);
 	__giveup_vsx(tsk);
 }
 
@@ -339,7 +380,7 @@ static void set_debug_reg_defaults(struct thread_struct *thread)
 #endif
 }
 
-static void prime_debug_regs(struct thread_struct *thread)
+static void prime_debug_regs(struct debug_reg *debug)
 {
 	/*
 	 * We could have inherited MSR_DE from userspace, since
@@ -348,22 +389,22 @@ static void prime_debug_regs(struct thread_struct *thread)
 	 */
 	mtmsr(mfmsr() & ~MSR_DE);
 
-	mtspr(SPRN_IAC1, thread->debug.iac1);
-	mtspr(SPRN_IAC2, thread->debug.iac2);
+	mtspr(SPRN_IAC1, debug->iac1);
+	mtspr(SPRN_IAC2, debug->iac2);
 #if CONFIG_PPC_ADV_DEBUG_IACS > 2
-	mtspr(SPRN_IAC3, thread->debug.iac3);
-	mtspr(SPRN_IAC4, thread->debug.iac4);
+	mtspr(SPRN_IAC3, debug->iac3);
+	mtspr(SPRN_IAC4, debug->iac4);
 #endif
-	mtspr(SPRN_DAC1, thread->debug.dac1);
-	mtspr(SPRN_DAC2, thread->debug.dac2);
+	mtspr(SPRN_DAC1, debug->dac1);
+	mtspr(SPRN_DAC2, debug->dac2);
 #if CONFIG_PPC_ADV_DEBUG_DVCS > 0
-	mtspr(SPRN_DVC1, thread->debug.dvc1);
-	mtspr(SPRN_DVC2, thread->debug.dvc2);
+	mtspr(SPRN_DVC1, debug->dvc1);
+	mtspr(SPRN_DVC2, debug->dvc2);
 #endif
-	mtspr(SPRN_DBCR0, thread->debug.dbcr0);
-	mtspr(SPRN_DBCR1, thread->debug.dbcr1);
+	mtspr(SPRN_DBCR0, debug->dbcr0);
+	mtspr(SPRN_DBCR1, debug->dbcr1);
 #ifdef CONFIG_BOOKE
-	mtspr(SPRN_DBCR2, thread->debug.dbcr2);
+	mtspr(SPRN_DBCR2, debug->dbcr2);
 #endif
 }
 /*
@@ -371,11 +412,11 @@ static void prime_debug_regs(struct thread_struct *thread)
  * debug registers, set the debug registers from the values
  * stored in the new thread.
  */
-void switch_booke_debug_regs(struct thread_struct *new_thread)
+void switch_booke_debug_regs(struct debug_reg *new_debug)
 {
 	if ((current->thread.debug.dbcr0 & DBCR0_IDM)
-		|| (new_thread->debug.dbcr0 & DBCR0_IDM))
-			prime_debug_regs(new_thread);
+		|| (new_debug->dbcr0 & DBCR0_IDM))
+			prime_debug_regs(new_debug);
 }
 EXPORT_SYMBOL_GPL(switch_booke_debug_regs);
 #else	/* !CONFIG_PPC_ADV_DEBUG_REGS */
@@ -479,7 +520,48 @@ static inline bool hw_brk_match(struct arch_hw_breakpoint *a,
 		return false;
 	return true;
 }
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static void tm_reclaim_thread(struct thread_struct *thr,
+			      struct thread_info *ti, uint8_t cause)
+{
+	unsigned long msr_diff = 0;
+
+	/*
+	 * If FP/VSX registers have been already saved to the
+	 * thread_struct, move them to the transact_fp array.
+	 * We clear the TIF_RESTORE_TM bit since after the reclaim
+	 * the thread will no longer be transactional.
+	 */
+	if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) {
+		msr_diff = thr->tm_orig_msr & ~thr->regs->msr;
+		if (msr_diff & MSR_FP)
+			memcpy(&thr->transact_fp, &thr->fp_state,
+			       sizeof(struct thread_fp_state));
+		if (msr_diff & MSR_VEC)
+			memcpy(&thr->transact_vr, &thr->vr_state,
+			       sizeof(struct thread_vr_state));
+		clear_ti_thread_flag(ti, TIF_RESTORE_TM);
+		msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1;
+	}
+
+	tm_reclaim(thr, thr->regs->msr, cause);
+
+	/* Having done the reclaim, we now have the checkpointed
+	 * FP/VSX values in the registers.  These might be valid
+	 * even if we have previously called enable_kernel_fp() or
+	 * flush_fp_to_thread(), so update thr->regs->msr to
+	 * indicate their current validity.
+	 */
+	thr->regs->msr |= msr_diff;
+}
+
+void tm_reclaim_current(uint8_t cause)
+{
+	tm_enable();
+	tm_reclaim_thread(&current->thread, current_thread_info(), cause);
+}
+
 static inline void tm_reclaim_task(struct task_struct *tsk)
 {
 	/* We have to work out if we're switching from/to a task that's in the
@@ -502,9 +584,11 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 
 	/* Stash the original thread MSR, as giveup_fpu et al will
 	 * modify it.  We hold onto it to see whether the task used
-	 * FP & vector regs.
+	 * FP & vector regs.  If the TIF_RESTORE_TM flag is set,
+	 * tm_orig_msr is already set.
 	 */
-	thr->tm_orig_msr = thr->regs->msr;
+	if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM))
+		thr->tm_orig_msr = thr->regs->msr;
 
 	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
 		 "ccr=%lx, msr=%lx, trap=%lx)\n",
@@ -512,7 +596,7 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 		 thr->regs->ccr, thr->regs->msr,
 		 thr->regs->trap);
 
-	tm_reclaim(thr, thr->regs->msr, TM_CAUSE_RESCHED);
+	tm_reclaim_thread(thr, task_thread_info(tsk), TM_CAUSE_RESCHED);
 
 	TM_DEBUG("--- tm_reclaim on pid %d complete\n",
 		 tsk->pid);
@@ -588,6 +672,43 @@ static inline void __switch_to_tm(struct task_struct *prev)
 		tm_reclaim_task(prev);
 	}
 }
+
+/*
+ * This is called if we are on the way out to userspace and the
+ * TIF_RESTORE_TM flag is set.  It checks if we need to reload
+ * FP and/or vector state and does so if necessary.
+ * If userspace is inside a transaction (whether active or
+ * suspended) and FP/VMX/VSX instructions have ever been enabled
+ * inside that transaction, then we have to keep them enabled
+ * and keep the FP/VMX/VSX state loaded while ever the transaction
+ * continues.  The reason is that if we didn't, and subsequently
+ * got a FP/VMX/VSX unavailable interrupt inside a transaction,
+ * we don't know whether it's the same transaction, and thus we
+ * don't know which of the checkpointed state and the transactional
+ * state to use.
+ */
+void restore_tm_state(struct pt_regs *regs)
+{
+	unsigned long msr_diff;
+
+	clear_thread_flag(TIF_RESTORE_TM);
+	if (!MSR_TM_ACTIVE(regs->msr))
+		return;
+
+	msr_diff = current->thread.tm_orig_msr & ~regs->msr;
+	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
+	if (msr_diff & MSR_FP) {
+		fp_enable();
+		load_fp_state(&current->thread.fp_state);
+		regs->msr |= current->thread.fpexc_mode;
+	}
+	if (msr_diff & MSR_VEC) {
+		vec_enable();
+		load_vr_state(&current->thread.vr_state);
+	}
+	regs->msr |= msr_diff;
+}
+
 #else
 #define tm_recheckpoint_new_task(new)
 #define __switch_to_tm(prev)
@@ -683,14 +804,14 @@ struct task_struct *__switch_to(struct task_struct *prev,
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
-	switch_booke_debug_regs(&new->thread);
+	switch_booke_debug_regs(&new->thread.debug);
 #else
 /*
  * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
  * schedule DABR
  */
 #ifndef CONFIG_HAVE_HW_BREAKPOINT
-	if (unlikely(hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk)))
+	if (unlikely(!hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk)))
 		set_breakpoint(&new->thread.hw_brk);
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 #endif
@@ -1175,6 +1296,19 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
 	if (val & PR_FP_EXC_SW_ENABLE) {
 #ifdef CONFIG_SPE
 		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			tsk->thread.fpexc_mode = val &
 				(PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
 			return 0;
@@ -1206,9 +1340,22 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
 
 	if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
 #ifdef CONFIG_SPE
-		if (cpu_has_feature(CPU_FTR_SPE))
+		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			val = tsk->thread.fpexc_mode;
-		else
+		} else
 			return -EINVAL;
 #else
 		return -EINVAL;
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fa0ad8aafbcc..f58c0d3aaeb4 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -523,6 +523,20 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
 	return early_init_dt_scan_memory(node, uname, depth, data);
 }
 
+/*
+ * For a relocatable kernel, we need to get the memstart_addr first,
+ * then use it to calculate the virtual kernel start address. This has
+ * to happen at a very early stage (before machine_init). In this case,
+ * we just want to get the memstart_address and would not like to mess the
+ * memblock at this stage. So introduce a variable to skip the memblock_add()
+ * for this reason.
+ */
+#ifdef CONFIG_RELOCATABLE
+static int add_mem_to_memblock = 1;
+#else
+#define add_mem_to_memblock 1
+#endif
+
 void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 {
 #ifdef CONFIG_PPC64
@@ -543,7 +557,8 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 	}
 
 	/* Add the chunk to the MEMBLOCK list */
-	memblock_add(base, size);
+	if (add_mem_to_memblock)
+		memblock_add(base, size);
 }
 
 static void __init early_reserve_mem_dt(void)
@@ -740,6 +755,30 @@ void __init early_init_devtree(void *params)
 	DBG(" <- early_init_devtree()\n");
 }
 
+#ifdef CONFIG_RELOCATABLE
+/*
+ * This function run before early_init_devtree, so we have to init
+ * initial_boot_params.
+ */
+void __init early_get_first_memblock_info(void *params, phys_addr_t *size)
+{
+	/* Setup flat device-tree pointer */
+	initial_boot_params = params;
+
+	/*
+	 * Scan the memory nodes and set add_mem_to_memblock to 0 to avoid
+	 * mess the memblock.
+	 */
+	add_mem_to_memblock = 0;
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+	of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
+	add_mem_to_memblock = 1;
+
+	if (size)
+		*size = first_memblock_size;
+}
+#endif
+
 /*******
  *
  * New implementation of the OF "find" APIs, return a refcounted
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index cb64a6e1dc51..078145acf7fb 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1986,19 +1986,23 @@ static void __init prom_init_stdout(void)
 	/* Get the full OF pathname of the stdout device */
 	memset(path, 0, 256);
 	call_prom("instance-to-path", 3, 1, prom.stdout, path, 255);
-	stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout);
-	val = cpu_to_be32(stdout_node);
-	prom_setprop(prom.chosen, "/chosen", "linux,stdout-package",
-		     &val, sizeof(val));
 	prom_printf("OF stdout device is: %s\n", of_stdout_device);
 	prom_setprop(prom.chosen, "/chosen", "linux,stdout-path",
 		     path, strlen(path) + 1);
 
-	/* If it's a display, note it */
-	memset(type, 0, sizeof(type));
-	prom_getprop(stdout_node, "device_type", type, sizeof(type));
-	if (strcmp(type, "display") == 0)
-		prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0);
+	/* instance-to-package fails on PA-Semi */
+	stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout);
+	if (stdout_node != PROM_ERROR) {
+		val = cpu_to_be32(stdout_node);
+		prom_setprop(prom.chosen, "/chosen", "linux,stdout-package",
+			     &val, sizeof(val));
+
+		/* If it's a display, note it */
+		memset(type, 0, sizeof(type));
+		prom_getprop(stdout_node, "device_type", type, sizeof(type));
+		if (strcmp(type, "display") == 0)
+			prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0);
+	}
 }
 
 static int __init prom_find_machine_type(void)
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 75fb40498b41..2e3d2bf536c5 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1555,7 +1555,7 @@ long arch_ptrace(struct task_struct *child, long request,
 
 			flush_fp_to_thread(child);
 			if (fpidx < (PT_FPSCR - PT_FPR0))
-				memcpy(&tmp, &child->thread.fp_state.fpr,
+				memcpy(&tmp, &child->thread.TS_FPR(fpidx),
 				       sizeof(long));
 			else
 				tmp = child->thread.fp_state.fpscr;
@@ -1588,7 +1588,7 @@ long arch_ptrace(struct task_struct *child, long request,
 
 			flush_fp_to_thread(child);
 			if (fpidx < (PT_FPSCR - PT_FPR0))
-				memcpy(&child->thread.fp_state.fpr, &data,
+				memcpy(&child->thread.TS_FPR(fpidx), &data,
 				       sizeof(long));
 			else
 				child->thread.fp_state.fpscr = data;
diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S
index b47a0e1ab001..1482327cfeba 100644
--- a/arch/powerpc/kernel/reloc_64.S
+++ b/arch/powerpc/kernel/reloc_64.S
@@ -69,8 +69,8 @@ _GLOBAL(relocate)
 	 * R_PPC64_RELATIVE ones.
 	 */
 	mtctr	r8
-5:	lwz	r0,12(9)	/* ELF64_R_TYPE(reloc->r_info) */
-	cmpwi	r0,R_PPC64_RELATIVE
+5:	ld	r0,8(9)		/* ELF64_R_TYPE(reloc->r_info) */
+	cmpdi	r0,R_PPC64_RELATIVE
 	bne	6f
 	ld	r6,0(r9)	/* reloc->r_offset */
 	ld	r0,16(r9)	/* reloc->r_addend */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index febc80445d25..bc76cc6b419c 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -479,7 +479,7 @@ void __init smp_setup_cpu_maps(void)
 	if (machine_is(pseries) && firmware_has_feature(FW_FEATURE_LPAR) &&
 	    (dn = of_find_node_by_path("/rtas"))) {
 		int num_addr_cell, num_size_cell, maxcpus;
-		const unsigned int *ireg;
+		const __be32 *ireg;
 
 		num_addr_cell = of_n_addr_cells(dn);
 		num_size_cell = of_n_size_cells(dn);
@@ -489,7 +489,7 @@ void __init smp_setup_cpu_maps(void)
 		if (!ireg)
 			goto out;
 
-		maxcpus = ireg[num_addr_cell + num_size_cell];
+		maxcpus = be32_to_cpup(ireg + num_addr_cell + num_size_cell);
 
 		/* Double maxcpus for processors which have SMT capability */
 		if (cpu_has_feature(CPU_FTR_SMT))
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index b903dc5cf944..04cc4fcca78b 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -247,7 +247,12 @@ static void __init exc_lvl_early_init(void)
 	/* interrupt stacks must be in lowmem, we get that for free on ppc32
 	 * as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */
 	for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
 		hw_cpu = get_hard_smp_processor_id(i);
+#else
+		hw_cpu = 0;
+#endif
+
 		critirq_ctx[hw_cpu] = (struct thread_info *)
 			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
 #ifdef CONFIG_BOOKE
@@ -296,9 +301,6 @@ void __init setup_arch(char **cmdline_p)
 	if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE))
 		ucache_bsize = icache_bsize = dcache_bsize;
 
-	/* reboot on panic */
-	panic_timeout = 180;
-
 	if (ppc_md.panic)
 		setup_panic();
 
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4085aaa9478f..f5f11a7d30e5 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -97,6 +97,36 @@ int dcache_bsize;
 int icache_bsize;
 int ucache_bsize;
 
+#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
+static void setup_tlb_core_data(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int first = cpu_first_thread_sibling(cpu);
+
+		paca[cpu].tcd_ptr = &paca[first].tcd;
+
+		/*
+		 * If we have threads, we need either tlbsrx.
+		 * or e6500 tablewalk mode, or else TLB handlers
+		 * will be racy and could produce duplicate entries.
+		 */
+		if (smt_enabled_at_boot >= 2 &&
+		    !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
+		    book3e_htw_mode != PPC_HTW_E6500) {
+			/* Should we panic instead? */
+			WARN_ONCE("%s: unsupported MMU configuration -- expect problems\n",
+				  __func__);
+		}
+	}
+}
+#else
+static void setup_tlb_core_data(void)
+{
+}
+#endif
+
 #ifdef CONFIG_SMP
 
 static char *smt_enabled_cmdline;
@@ -445,6 +475,7 @@ void __init setup_system(void)
 
 	smp_setup_cpu_maps();
 	check_smt_enabled();
+	setup_tlb_core_data();
 
 #ifdef CONFIG_SMP
 	/* Release secondary cpus out of their spinloops at 0x60 now that
@@ -520,9 +551,6 @@ static void __init irqstack_early_init(void)
 #ifdef CONFIG_PPC_BOOK3E
 static void __init exc_lvl_early_init(void)
 {
-	extern unsigned int interrupt_base_book3e;
-	extern unsigned int exc_debug_debug_book3e;
-
 	unsigned int i;
 
 	for_each_possible_cpu(i) {
@@ -535,8 +563,7 @@ static void __init exc_lvl_early_init(void)
 	}
 
 	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
-		patch_branch(&interrupt_base_book3e + (0x040 / 4) + 1,
-			     (unsigned long)&exc_debug_debug_book3e, 0);
+		patch_exception(0x040, exc_debug_debug_book3e);
 }
 #else
 #define exc_lvl_early_init()
@@ -544,7 +571,8 @@ static void __init exc_lvl_early_init(void)
 
 /*
  * Stack space used when we detect a bad kernel stack pointer, and
- * early in SMP boots before relocation is enabled.
+ * early in SMP boots before relocation is enabled. Exclusive emergency
+ * stack for machine checks.
  */
 static void __init emergency_stack_init(void)
 {
@@ -567,6 +595,13 @@ static void __init emergency_stack_init(void)
 		sp  = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
 		sp += THREAD_SIZE;
 		paca[i].emergency_sp = __va(sp);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for machine check exception handling. */
+		sp  = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
+		sp += THREAD_SIZE;
+		paca[i].mc_emergency_sp = __va(sp);
+#endif
 	}
 }
 
@@ -588,9 +623,6 @@ void __init setup_arch(char **cmdline_p)
 	dcache_bsize = ppc64_caches.dline_size;
 	icache_bsize = ppc64_caches.iline_size;
 
-	/* reboot on panic */
-	panic_timeout = 180;
-
 	if (ppc_md.panic)
 		setup_panic();
 
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 457e97aa2945..8fc4177ed65a 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -203,8 +203,7 @@ unsigned long get_tm_stackpointer(struct pt_regs *regs)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (MSR_TM_ACTIVE(regs->msr)) {
-		tm_enable();
-		tm_reclaim(&current->thread, regs->msr, TM_CAUSE_SIGNAL);
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
 		if (MSR_TM_TRANSACTIONAL(regs->msr))
 			return current->thread.ckpt_regs.gpr[1];
 	}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 68027bfa5f8e..a67e00aa3caa 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -519,6 +519,13 @@ static int save_tm_user_regs(struct pt_regs *regs,
 {
 	unsigned long msr = regs->msr;
 
+	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
+	 * just indicates to userland that we were doing a transaction, but we
+	 * don't want to return in transactional state.  This also ensures
+	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+	 */
+	regs->msr &= ~MSR_TS_MASK;
+
 	/* Make sure floating point registers are stored in regs */
 	flush_fp_to_thread(current);
 
@@ -1015,29 +1022,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_frame = &rt_sf->uc_transact.uc_mcontext;
 	if (MSR_TM_ACTIVE(regs->msr)) {
+		if (__put_user((unsigned long)&rt_sf->uc_transact,
+			       &rt_sf->uc.uc_link) ||
+		    __put_user((unsigned long)tm_frame,
+			       &rt_sf->uc_transact.uc_regs))
+			goto badframe;
 		if (save_tm_user_regs(regs, frame, tm_frame, sigret))
 			goto badframe;
 	}
 	else
 #endif
 	{
+		if (__put_user(0, &rt_sf->uc.uc_link))
+			goto badframe;
 		if (save_user_regs(regs, frame, tm_frame, sigret, 1))
 			goto badframe;
 	}
 	regs->link = tramp;
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(regs->msr)) {
-		if (__put_user((unsigned long)&rt_sf->uc_transact,
-			       &rt_sf->uc.uc_link)
-		    || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs))
-			goto badframe;
-	}
-	else
-#endif
-		if (__put_user(0, &rt_sf->uc.uc_link))
-			goto badframe;
-
 	current->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
 
 	/* create a stack frame for the caller of the handler */
@@ -1056,13 +1058,6 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 	/* enter the signal handler in native-endian mode */
 	regs->msr &= ~MSR_LE;
 	regs->msr |= (MSR_KERNEL & MSR_LE);
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 	return 1;
 
 badframe:
@@ -1484,13 +1479,6 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 	regs->nip = (unsigned long) ka->sa.sa_handler;
 	/* enter the signal handler in big-endian mode */
 	regs->msr &= ~MSR_LE;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 	return 1;
 
 badframe:
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 42991045349f..e35bf773df7a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -192,6 +192,13 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 
 	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
 
+	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
+	 * just indicates to userland that we were doing a transaction, but we
+	 * don't want to return in transactional state.  This also ensures
+	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+	 */
+	regs->msr &= ~MSR_TS_MASK;
+
 	flush_fp_to_thread(current);
 
 #ifdef CONFIG_ALTIVEC
@@ -749,13 +756,6 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
 
 	/* Make sure signal handler doesn't get spurious FP exceptions */
 	current->thread.fp_state.fpscr = 0;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 
 	/* Set up to return from userspace. */
 	if (vdso64_rt_sigtramp && current->mm->context.vdso_base) {
diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c
index e68fd1ae727a..7a37ecd3afa3 100644
--- a/arch/powerpc/kernel/smp-tbsync.c
+++ b/arch/powerpc/kernel/smp-tbsync.c
@@ -9,7 +9,6 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/unistd.h>
-#include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <asm/smp.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a3b64f3bf9a2..ac2621af3154 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -369,13 +369,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
 	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
-	if (smp_ops)
-		if (smp_ops->probe)
-			max_cpus = smp_ops->probe();
-		else
-			max_cpus = NR_CPUS;
-	else
-		max_cpus = 1;
+	if (smp_ops && smp_ops->probe)
+		smp_ops->probe();
 }
 
 void smp_prepare_boot_cpu(void)
@@ -580,7 +575,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 int cpu_to_core_id(int cpu)
 {
 	struct device_node *np;
-	const int *reg;
+	const __be32 *reg;
 	int id = -1;
 
 	np = of_get_cpu_node(cpu, NULL);
@@ -591,7 +586,7 @@ int cpu_to_core_id(int cpu)
 	if (!reg)
 		goto out;
 
-	id = *reg;
+	id = be32_to_cpup(reg);
 out:
 	of_node_put(np);
 	return id;
diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S
index 0f204053e5b5..553c1405ee05 100644
--- a/arch/powerpc/kernel/swsusp_booke.S
+++ b/arch/powerpc/kernel/swsusp_booke.S
@@ -74,21 +74,21 @@ _GLOBAL(swsusp_arch_suspend)
 	bne	1b
 
 	/* Save SPRGs */
-	mfsprg	r4,0
+	mfspr	r4,SPRN_SPRG0
 	stw	r4,SL_SPRG0(r11)
-	mfsprg	r4,1
+	mfspr	r4,SPRN_SPRG1
 	stw	r4,SL_SPRG1(r11)
-	mfsprg	r4,2
+	mfspr	r4,SPRN_SPRG2
 	stw	r4,SL_SPRG2(r11)
-	mfsprg	r4,3
+	mfspr	r4,SPRN_SPRG3
 	stw	r4,SL_SPRG3(r11)
-	mfsprg	r4,4
+	mfspr	r4,SPRN_SPRG4
 	stw	r4,SL_SPRG4(r11)
-	mfsprg	r4,5
+	mfspr	r4,SPRN_SPRG5
 	stw	r4,SL_SPRG5(r11)
-	mfsprg	r4,6
+	mfspr	r4,SPRN_SPRG6
 	stw	r4,SL_SPRG6(r11)
-	mfsprg	r4,7
+	mfspr	r4,SPRN_SPRG7
 	stw	r4,SL_SPRG7(r11)
 
 	/* Call the low level suspend stuff (we should probably have made
@@ -150,21 +150,21 @@ _GLOBAL(swsusp_arch_resume)
 	bl	_tlbil_all
 
 	lwz	r4,SL_SPRG0(r11)
-	mtsprg	0,r4
+	mtspr	SPRN_SPRG0,r4
 	lwz	r4,SL_SPRG1(r11)
-	mtsprg	1,r4
+	mtspr	SPRN_SPRG1,r4
 	lwz	r4,SL_SPRG2(r11)
-	mtsprg	2,r4
+	mtspr	SPRN_SPRG2,r4
 	lwz	r4,SL_SPRG3(r11)
-	mtsprg	3,r4
+	mtspr	SPRN_SPRG3,r4
 	lwz	r4,SL_SPRG4(r11)
-	mtsprg	4,r4
+	mtspr	SPRN_SPRG4,r4
 	lwz	r4,SL_SPRG5(r11)
-	mtsprg	5,r4
+	mtspr	SPRN_SPRG5,r4
 	lwz	r4,SL_SPRG6(r11)
-	mtsprg	6,r4
+	mtspr	SPRN_SPRG6,r4
 	lwz	r4,SL_SPRG7(r11)
-	mtsprg	7,r4
+	mtspr	SPRN_SPRG7,r4
 
 	/* restore the MSR */
 	lwz	r3,SL_MSR(r11)
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index 4e3cc47f26b9..cd9be9aa016d 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -34,7 +34,6 @@
 #include <linux/ipc.h>
 #include <linux/utsname.h>
 #include <linux/file.h>
-#include <linux/init.h>
 #include <linux/personality.h>
 
 #include <asm/uaccess.h>
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index b4e667663d9b..97e1dc917683 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -51,8 +51,6 @@ static ssize_t store_smt_snooze_delay(struct device *dev,
 		return -EINVAL;
 
 	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
-	update_smt_snooze_delay(cpu->dev.id, snooze);
-
 	return count;
 }
 
@@ -86,6 +84,304 @@ __setup("smt-snooze-delay=", setup_smt_snooze_delay);
 
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define MAX_BIT				63
+
+static u64 pw20_wt;
+static u64 altivec_idle_wt;
+
+static unsigned int get_idle_ticks_bit(u64 ns)
+{
+	u64 cycle;
+
+	if (ns >= 10000)
+		cycle = div_u64(ns + 500, 1000) * tb_ticks_per_usec;
+	else
+		cycle = div_u64(ns * tb_ticks_per_usec, 1000);
+
+	if (!cycle)
+		return 0;
+
+	return ilog2(cycle);
+}
+
+static void do_show_pwrmgtcr0(void *val)
+{
+	u32 *value = val;
+
+	*value = mfspr(SPRN_PWRMGTCR0);
+}
+
+static ssize_t show_pw20_state(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+	value &= PWRMGTCR0_PW20_WAIT;
+
+	return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_pw20_state(void *val)
+{
+	u32 *value = val;
+	u32 pw20_state;
+
+	pw20_state = mfspr(SPRN_PWRMGTCR0);
+
+	if (*value)
+		pw20_state |= PWRMGTCR0_PW20_WAIT;
+	else
+		pw20_state &= ~PWRMGTCR0_PW20_WAIT;
+
+	mtspr(SPRN_PWRMGTCR0, pw20_state);
+}
+
+static ssize_t store_pw20_state(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	if (kstrtou32(buf, 0, &value))
+		return -EINVAL;
+
+	if (value > 1)
+		return -EINVAL;
+
+	smp_call_function_single(cpu, do_store_pw20_state, &value, 1);
+
+	return count;
+}
+
+static ssize_t show_pw20_wait_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	u64 tb_cycle = 1;
+	u64 time;
+
+	unsigned int cpu = dev->id;
+
+	if (!pw20_wt) {
+		smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+		value = (value & PWRMGTCR0_PW20_ENT) >>
+					PWRMGTCR0_PW20_ENT_SHIFT;
+
+		tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+		/* convert ms to ns */
+		if (tb_ticks_per_usec > 1000) {
+			time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+		} else {
+			u32 rem_us;
+
+			time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+						&rem_us);
+			time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+		}
+	} else {
+		time = pw20_wt;
+	}
+
+	return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_pw20_wait_entry_bit(void *val)
+{
+	u32 *value = val;
+	u32 pw20_idle;
+
+	pw20_idle = mfspr(SPRN_PWRMGTCR0);
+
+	/* Set Automatic PW20 Core Idle Count */
+	/* clear count */
+	pw20_idle &= ~PWRMGTCR0_PW20_ENT;
+
+	/* set count */
+	pw20_idle |= ((MAX_BIT - *value) << PWRMGTCR0_PW20_ENT_SHIFT);
+
+	mtspr(SPRN_PWRMGTCR0, pw20_idle);
+}
+
+static ssize_t store_pw20_wait_time(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 entry_bit;
+	u64 value;
+
+	unsigned int cpu = dev->id;
+
+	if (kstrtou64(buf, 0, &value))
+		return -EINVAL;
+
+	if (!value)
+		return -EINVAL;
+
+	entry_bit = get_idle_ticks_bit(value);
+	if (entry_bit > MAX_BIT)
+		return -EINVAL;
+
+	pw20_wt = value;
+
+	smp_call_function_single(cpu, set_pw20_wait_entry_bit,
+				&entry_bit, 1);
+
+	return count;
+}
+
+static ssize_t show_altivec_idle(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+	value &= PWRMGTCR0_AV_IDLE_PD_EN;
+
+	return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_altivec_idle(void *val)
+{
+	u32 *value = val;
+	u32 altivec_idle;
+
+	altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+	if (*value)
+		altivec_idle |= PWRMGTCR0_AV_IDLE_PD_EN;
+	else
+		altivec_idle &= ~PWRMGTCR0_AV_IDLE_PD_EN;
+
+	mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	if (kstrtou32(buf, 0, &value))
+		return -EINVAL;
+
+	if (value > 1)
+		return -EINVAL;
+
+	smp_call_function_single(cpu, do_store_altivec_idle, &value, 1);
+
+	return count;
+}
+
+static ssize_t show_altivec_idle_wait_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	u64 tb_cycle = 1;
+	u64 time;
+
+	unsigned int cpu = dev->id;
+
+	if (!altivec_idle_wt) {
+		smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+		value = (value & PWRMGTCR0_AV_IDLE_CNT) >>
+					PWRMGTCR0_AV_IDLE_CNT_SHIFT;
+
+		tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+		/* convert ms to ns */
+		if (tb_ticks_per_usec > 1000) {
+			time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+		} else {
+			u32 rem_us;
+
+			time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+						&rem_us);
+			time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+		}
+	} else {
+		time = altivec_idle_wt;
+	}
+
+	return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_altivec_idle_wait_entry_bit(void *val)
+{
+	u32 *value = val;
+	u32 altivec_idle;
+
+	altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+	/* Set Automatic AltiVec Idle Count */
+	/* clear count */
+	altivec_idle &= ~PWRMGTCR0_AV_IDLE_CNT;
+
+	/* set count */
+	altivec_idle |= ((MAX_BIT - *value) << PWRMGTCR0_AV_IDLE_CNT_SHIFT);
+
+	mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle_wait_time(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 entry_bit;
+	u64 value;
+
+	unsigned int cpu = dev->id;
+
+	if (kstrtou64(buf, 0, &value))
+		return -EINVAL;
+
+	if (!value)
+		return -EINVAL;
+
+	entry_bit = get_idle_ticks_bit(value);
+	if (entry_bit > MAX_BIT)
+		return -EINVAL;
+
+	altivec_idle_wt = value;
+
+	smp_call_function_single(cpu, set_altivec_idle_wait_entry_bit,
+				&entry_bit, 1);
+
+	return count;
+}
+
+/*
+ * Enable/Disable interface:
+ * 0, disable. 1, enable.
+ */
+static DEVICE_ATTR(pw20_state, 0600, show_pw20_state, store_pw20_state);
+static DEVICE_ATTR(altivec_idle, 0600, show_altivec_idle, store_altivec_idle);
+
+/*
+ * Set wait time interface:(Nanosecond)
+ * Example: Base on TBfreq is 41MHZ.
+ * 1~48(ns): TB[63]
+ * 49~97(ns): TB[62]
+ * 98~195(ns): TB[61]
+ * 196~390(ns): TB[60]
+ * 391~780(ns): TB[59]
+ * 781~1560(ns): TB[58]
+ * ...
+ */
+static DEVICE_ATTR(pw20_wait_time, 0600,
+			show_pw20_wait_time,
+			store_pw20_wait_time);
+static DEVICE_ATTR(altivec_idle_wait_time, 0600,
+			show_altivec_idle_wait_time,
+			store_altivec_idle_wait_time);
+#endif
+
 /*
  * Enabling PMCs will slow partition context switch times so we only do
  * it the first time we write to the PMCs.
@@ -108,14 +404,14 @@ void ppc_enable_pmcs(void)
 }
 EXPORT_SYMBOL(ppc_enable_pmcs);
 
-#define SYSFS_PMCSETUP(NAME, ADDRESS) \
+#define __SYSFS_SPRSETUP(NAME, ADDRESS, EXTRA) \
 static void read_##NAME(void *val) \
 { \
 	*(unsigned long *)val = mfspr(ADDRESS);	\
 } \
 static void write_##NAME(void *val) \
 { \
-	ppc_enable_pmcs(); \
+	EXTRA; \
 	mtspr(ADDRESS, *(unsigned long *)val);	\
 } \
 static ssize_t show_##NAME(struct device *dev, \
@@ -140,6 +436,10 @@ static ssize_t __used \
 	return count; \
 }
 
+#define SYSFS_PMCSETUP(NAME, ADDRESS)	\
+	__SYSFS_SPRSETUP(NAME, ADDRESS, ppc_enable_pmcs())
+#define SYSFS_SPRSETUP(NAME, ADDRESS)	\
+	__SYSFS_SPRSETUP(NAME, ADDRESS, )
 
 /* Let's define all possible registers, we'll only hook up the ones
  * that are implemented on the current processor
@@ -175,10 +475,10 @@ SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
 SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
 
 SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
-SYSFS_PMCSETUP(purr, SPRN_PURR);
-SYSFS_PMCSETUP(spurr, SPRN_SPURR);
-SYSFS_PMCSETUP(dscr, SPRN_DSCR);
-SYSFS_PMCSETUP(pir, SPRN_PIR);
+SYSFS_SPRSETUP(purr, SPRN_PURR);
+SYSFS_SPRSETUP(spurr, SPRN_SPURR);
+SYSFS_SPRSETUP(dscr, SPRN_DSCR);
+SYSFS_SPRSETUP(pir, SPRN_PIR);
 
 /*
   Lets only enable read for phyp resources and
@@ -249,34 +549,34 @@ SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
 SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
 SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
 #ifdef CONFIG_DEBUG_KERNEL
-SYSFS_PMCSETUP(hid0, SPRN_HID0);
-SYSFS_PMCSETUP(hid1, SPRN_HID1);
-SYSFS_PMCSETUP(hid4, SPRN_HID4);
-SYSFS_PMCSETUP(hid5, SPRN_HID5);
-SYSFS_PMCSETUP(ima0, SPRN_PA6T_IMA0);
-SYSFS_PMCSETUP(ima1, SPRN_PA6T_IMA1);
-SYSFS_PMCSETUP(ima2, SPRN_PA6T_IMA2);
-SYSFS_PMCSETUP(ima3, SPRN_PA6T_IMA3);
-SYSFS_PMCSETUP(ima4, SPRN_PA6T_IMA4);
-SYSFS_PMCSETUP(ima5, SPRN_PA6T_IMA5);
-SYSFS_PMCSETUP(ima6, SPRN_PA6T_IMA6);
-SYSFS_PMCSETUP(ima7, SPRN_PA6T_IMA7);
-SYSFS_PMCSETUP(ima8, SPRN_PA6T_IMA8);
-SYSFS_PMCSETUP(ima9, SPRN_PA6T_IMA9);
-SYSFS_PMCSETUP(imaat, SPRN_PA6T_IMAAT);
-SYSFS_PMCSETUP(btcr, SPRN_PA6T_BTCR);
-SYSFS_PMCSETUP(pccr, SPRN_PA6T_PCCR);
-SYSFS_PMCSETUP(rpccr, SPRN_PA6T_RPCCR);
-SYSFS_PMCSETUP(der, SPRN_PA6T_DER);
-SYSFS_PMCSETUP(mer, SPRN_PA6T_MER);
-SYSFS_PMCSETUP(ber, SPRN_PA6T_BER);
-SYSFS_PMCSETUP(ier, SPRN_PA6T_IER);
-SYSFS_PMCSETUP(sier, SPRN_PA6T_SIER);
-SYSFS_PMCSETUP(siar, SPRN_PA6T_SIAR);
-SYSFS_PMCSETUP(tsr0, SPRN_PA6T_TSR0);
-SYSFS_PMCSETUP(tsr1, SPRN_PA6T_TSR1);
-SYSFS_PMCSETUP(tsr2, SPRN_PA6T_TSR2);
-SYSFS_PMCSETUP(tsr3, SPRN_PA6T_TSR3);
+SYSFS_SPRSETUP(hid0, SPRN_HID0);
+SYSFS_SPRSETUP(hid1, SPRN_HID1);
+SYSFS_SPRSETUP(hid4, SPRN_HID4);
+SYSFS_SPRSETUP(hid5, SPRN_HID5);
+SYSFS_SPRSETUP(ima0, SPRN_PA6T_IMA0);
+SYSFS_SPRSETUP(ima1, SPRN_PA6T_IMA1);
+SYSFS_SPRSETUP(ima2, SPRN_PA6T_IMA2);
+SYSFS_SPRSETUP(ima3, SPRN_PA6T_IMA3);
+SYSFS_SPRSETUP(ima4, SPRN_PA6T_IMA4);
+SYSFS_SPRSETUP(ima5, SPRN_PA6T_IMA5);
+SYSFS_SPRSETUP(ima6, SPRN_PA6T_IMA6);
+SYSFS_SPRSETUP(ima7, SPRN_PA6T_IMA7);
+SYSFS_SPRSETUP(ima8, SPRN_PA6T_IMA8);
+SYSFS_SPRSETUP(ima9, SPRN_PA6T_IMA9);
+SYSFS_SPRSETUP(imaat, SPRN_PA6T_IMAAT);
+SYSFS_SPRSETUP(btcr, SPRN_PA6T_BTCR);
+SYSFS_SPRSETUP(pccr, SPRN_PA6T_PCCR);
+SYSFS_SPRSETUP(rpccr, SPRN_PA6T_RPCCR);
+SYSFS_SPRSETUP(der, SPRN_PA6T_DER);
+SYSFS_SPRSETUP(mer, SPRN_PA6T_MER);
+SYSFS_SPRSETUP(ber, SPRN_PA6T_BER);
+SYSFS_SPRSETUP(ier, SPRN_PA6T_IER);
+SYSFS_SPRSETUP(sier, SPRN_PA6T_SIER);
+SYSFS_SPRSETUP(siar, SPRN_PA6T_SIAR);
+SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0);
+SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1);
+SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2);
+SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
 #endif /* CONFIG_DEBUG_KERNEL */
 #endif /* HAS_PPC_PMC_PA6T */
 
@@ -421,6 +721,15 @@ static void register_cpu_online(unsigned int cpu)
 		device_create_file(s, &dev_attr_pir);
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+		device_create_file(s, &dev_attr_pw20_state);
+		device_create_file(s, &dev_attr_pw20_wait_time);
+
+		device_create_file(s, &dev_attr_altivec_idle);
+		device_create_file(s, &dev_attr_altivec_idle_wait_time);
+	}
+#endif
 	cacheinfo_cpu_online(cpu);
 }
 
@@ -493,6 +802,15 @@ static void unregister_cpu_online(unsigned int cpu)
 		device_remove_file(s, &dev_attr_pir);
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+		device_remove_file(s, &dev_attr_pw20_state);
+		device_remove_file(s, &dev_attr_pw20_wait_time);
+
+		device_remove_file(s, &dev_attr_altivec_idle);
+		device_remove_file(s, &dev_attr_altivec_idle_wait_time);
+	}
+#endif
 	cacheinfo_cpu_offline(cpu);
 }
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3b144121cc9..b3dab20acf34 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -510,7 +510,6 @@ void timer_interrupt(struct pt_regs * regs)
 	 */
 	may_hard_irq_enable();
 
-	__get_cpu_var(irq_stat).timer_irqs++;
 
 #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
@@ -532,10 +531,15 @@ void timer_interrupt(struct pt_regs * regs)
 		*next_tb = ~(u64)0;
 		if (evt->event_handler)
 			evt->event_handler(evt);
+		__get_cpu_var(irq_stat).timer_irqs_event++;
 	} else {
 		now = *next_tb - now;
 		if (now <= DECREMENTER_MAX)
 			set_dec((int)now);
+		/* We may have raced with new irq work */
+		if (test_irq_work_pending())
+			set_dec(1);
+		__get_cpu_var(irq_stat).timer_irqs_others++;
 	}
 
 #ifdef CONFIG_PPC64
@@ -801,8 +805,16 @@ static void __init clocksource_init(void)
 static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev)
 {
+	/* Don't adjust the decrementer if some irq work is pending */
+	if (test_irq_work_pending())
+		return 0;
 	__get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt;
 	set_dec(evt);
+
+	/* We may have raced with new irq work */
+	if (test_irq_work_pending())
+		set_dec(1);
+
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 907a472f9a9e..33cd7a0b8e73 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -285,6 +285,21 @@ void system_reset_exception(struct pt_regs *regs)
 
 	/* What should we do here? We could issue a shutdown or hard reset. */
 }
+
+/*
+ * This function is called in real mode. Strictly no printk's please.
+ *
+ * regs->nip and regs->msr contains srr0 and ssr1.
+ */
+long machine_check_early(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+		handled = cur_cpu_spec->machine_check_early(regs);
+	return handled;
+}
+
 #endif
 
 /*
@@ -1384,7 +1399,6 @@ void fp_unavailable_tm(struct pt_regs *regs)
 
 	TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
 		 regs->nip, regs->msr);
-	tm_enable();
 
         /* We can only have got here if the task started using FP after
          * beginning the transaction.  So, the transactional regs are just a
@@ -1393,8 +1407,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
          * transaction, and probably retry but now with FP enabled.  So the
          * checkpointed FP registers need to be loaded.
 	 */
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	/* Reclaim didn't save out any FPRs to transact_fprs. */
 
 	/* Enable FP for the task: */
@@ -1403,11 +1416,19 @@ void fp_unavailable_tm(struct pt_regs *regs)
 	/* This loads and recheckpoints the FP registers from
 	 * thread.fpr[].  They will remain in registers after the
 	 * checkpoint so we don't need to reload them after.
+	 * If VMX is in use, the VRs now hold checkpointed values,
+	 * so we don't want to load the VRs from the thread_struct.
 	 */
-	tm_recheckpoint(&current->thread, regs->msr);
+	tm_recheckpoint(&current->thread, MSR_FP);
+
+	/* If VMX is in use, get the transactional values back */
+	if (regs->msr & MSR_VEC) {
+		do_load_up_transact_altivec(&current->thread);
+		/* At this point all the VSX state is loaded, so enable it */
+		regs->msr |= MSR_VSX;
+	}
 }
 
-#ifdef CONFIG_ALTIVEC
 void altivec_unavailable_tm(struct pt_regs *regs)
 {
 	/* See the comments in fp_unavailable_tm().  This function operates
@@ -1417,18 +1438,21 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 	TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx,"
 		 "MSR=%lx\n",
 		 regs->nip, regs->msr);
-	tm_enable();
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	regs->msr |= MSR_VEC;
-	tm_recheckpoint(&current->thread, regs->msr);
+	tm_recheckpoint(&current->thread, MSR_VEC);
 	current->thread.used_vr = 1;
+
+	if (regs->msr & MSR_FP) {
+		do_load_up_transact_fpu(&current->thread);
+		regs->msr |= MSR_VSX;
+	}
 }
-#endif
 
-#ifdef CONFIG_VSX
 void vsx_unavailable_tm(struct pt_regs *regs)
 {
+	unsigned long orig_msr = regs->msr;
+
 	/* See the comments in fp_unavailable_tm().  This works similarly,
 	 * though we're loading both FP and VEC registers in here.
 	 *
@@ -1440,18 +1464,30 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 		 "MSR=%lx\n",
 		 regs->nip, regs->msr);
 
-	tm_enable();
+	current->thread.used_vsr = 1;
+
+	/* If FP and VMX are already loaded, we have all the state we need */
+	if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) {
+		regs->msr |= MSR_VSX;
+		return;
+	}
+
 	/* This reclaims FP and/or VR regs if they're already enabled */
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 
 	regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode |
 		MSR_VSX;
-	/* This loads & recheckpoints FP and VRs. */
-	tm_recheckpoint(&current->thread, regs->msr);
-	current->thread.used_vsr = 1;
+
+	/* This loads & recheckpoints FP and VRs; but we have
+	 * to be sure not to overwrite previously-valid state.
+	 */
+	tm_recheckpoint(&current->thread, regs->msr & ~orig_msr);
+
+	if (orig_msr & MSR_FP)
+		do_load_up_transact_fpu(&current->thread);
+	if (orig_msr & MSR_VEC)
+		do_load_up_transact_altivec(&current->thread);
 }
-#endif
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 void performance_monitor_exception(struct pt_regs *regs)
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
index 59f419b935f2..003b20964ea0 100644
--- a/arch/powerpc/kernel/uprobes.c
+++ b/arch/powerpc/kernel/uprobes.c
@@ -186,7 +186,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	 * emulate_step() returns 1 if the insn was successfully emulated.
 	 * For all other cases, we need to single-step in hardware.
 	 */
-	ret = emulate_step(regs, auprobe->ainsn);
+	ret = emulate_step(regs, auprobe->insn);
 	if (ret > 0)
 		return true;
 
diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 6e8f507ed32b..79683d0393f5 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/page.h>
 
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index b8553d62b792..8df9e2463007 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/page.h>
 
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 0458a9aaba9d..74f8050518d6 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -37,6 +37,16 @@ _GLOBAL(do_load_up_transact_altivec)
 #endif
 
 /*
+ * Enable use of VMX/Altivec for the caller.
+ */
+_GLOBAL(vec_enable)
+	mfmsr	r3
+	oris	r3,r3,MSR_VEC@h
+	MTMSRD(r3)
+	isync
+	blr
+
+/*
  * Load state from memory into VMX registers including VSCR.
  * Assumes the caller has enabled VMX in the MSR.
  */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 76a64821f4a2..826d8bd9e522 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -518,16 +518,18 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
                                          struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	dma_addr_t ret = DMA_ERROR_CODE;
 
-	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+	tbl = get_iommu_table_base(dev);
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
 		atomic_inc(&viodev->cmo.allocs_failed);
 		return ret;
 	}
 
 	ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
 	if (unlikely(dma_mapping_error(dev, ret))) {
-		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 		atomic_inc(&viodev->cmo.allocs_failed);
 	}
 
@@ -540,10 +542,12 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
 				     struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 
+	tbl = get_iommu_table_base(dev);
 	dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
 
-	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 }
 
 static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -551,12 +555,14 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
                                 struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	int ret, count = 0;
 	size_t alloc_size = 0;
 
+	tbl = get_iommu_table_base(dev);
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
 	if (vio_cmo_alloc(viodev, alloc_size)) {
 		atomic_inc(&viodev->cmo.allocs_failed);
@@ -572,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	}
 
 	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
-		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 	if (alloc_size)
 		vio_cmo_dealloc(viodev, alloc_size);
 
@@ -585,12 +591,14 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
 		struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	size_t alloc_size = 0;
 	int count = 0;
 
+	tbl = get_iommu_table_base(dev);
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
 	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
 
@@ -706,11 +714,14 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
 {
 	struct vio_cmo_dev_entry *dev_ent;
 	struct device *dev = &viodev->dev;
+	struct iommu_table *tbl;
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
 	unsigned long flags;
 	size_t size;
 	bool dma_capable = false;
 
+	tbl = get_iommu_table_base(dev);
+
 	/* A device requires entitlement if it has a DMA window property */
 	switch (viodev->family) {
 	case VDEVICE:
@@ -736,7 +747,8 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
 			return -EINVAL;
 		}
 
-		viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+		viodev->cmo.desired =
+			IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
 		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
 			viodev->cmo.desired = VIO_CMO_MIN_ENT;
 		size = VIO_CMO_MIN_ENT;
@@ -1176,9 +1188,10 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 			    &tbl->it_index, &offset, &size);
 
 	/* TCE table size - measured in tce entries */
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	tbl->it_size = size >> tbl->it_page_shift;
 	/* offset for VIO should always be 0 */
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = offset >> tbl->it_page_shift;
 	tbl->it_busno = 0;
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;