From 89ed0b769d6adf30364f60e6b1566961821a9893 Mon Sep 17 00:00:00 2001
From: Haren Myneni
Date: Sun, 9 Oct 2022 20:41:25 -0700
Subject: powerpc/pseries/vas: Add VAS IRQ primary handler

irq_default_primary_handler() can be used only with the IRQF_ONESHOT flag, but that flag disables the IRQ before executing the thread handler and re-enables it after the interrupt is handled. This IRQ disable sets the VAS IRQ OFF state in the hypervisor. If the NX faults during this window, the hypervisor will not deliver the fault interrupt to the partition and user space may wait indefinitely for the CSB update.

So use a VAS-specific IRQ handler instead of the default primary handler. The IRQ handler increments a pending_faults counter, and the bottom-half thread handler processes all faults based on this counter. If another interrupt is received while the thread is running, it is also processed using this counter. The top and bottom halves are synchronized with the IRQTF_RUNTHREAD flag, which causes the bottom half to be re-entered when it is set.

Signed-off-by: Haren Myneni
Reviewed-by: Frederic Barrat
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/aaad8813b4762a6753cfcd0b605a7574a5192ec7.camel@linux.ibm.com
---
 arch/powerpc/platforms/pseries/vas.c | 40 +++++++++++++++++++++++++++++-------
 arch/powerpc/platforms/pseries/vas.h |  1 +
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index 0e0524cbe20c..53b36381489e 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -200,16 +200,41 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
 	struct vas_user_win_ref *tsk_ref;
 	int rc;
 
-	rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
-	if (!rc) {
-		tsk_ref = &txwin->vas_win.task_ref;
-		vas_dump_crb(&crb);
-		vas_update_csb(&crb, tsk_ref);
+	while (atomic_read(&txwin->pending_faults)) {
+		rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
+		if (!rc) {
+			tsk_ref = &txwin->vas_win.task_ref;
+			vas_dump_crb(&crb);
+			vas_update_csb(&crb, tsk_ref);
+		}
+		atomic_dec(&txwin->pending_faults);
 	}
 
 	return IRQ_HANDLED;
 }
 
+/*
+ * irq_default_primary_handler() can be used only with IRQF_ONESHOT,
+ * which disables the IRQ before executing the thread handler and
+ * enables it after. But disabling the interrupt sets the VAS IRQ OFF
+ * state in the hypervisor. If the NX generates a fault interrupt
+ * during this window, the hypervisor will not deliver this
+ * interrupt to the LPAR. So use a VAS-specific IRQ handler instead
+ * of calling the default primary handler.
+ */
+static irqreturn_t pseries_vas_irq_handler(int irq, void *data)
+{
+	struct pseries_vas_window *txwin = data;
+
+	/*
+	 * The thread handler will process this interrupt if it is
+	 * already running.
+	 */
+	atomic_inc(&txwin->pending_faults);
+
+	return IRQ_WAKE_THREAD;
+}
+
 /*
  * Allocate window and setup IRQ mapping.
 */
@@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin,
 		goto out_irq;
 	}
 
-	rc = request_threaded_irq(txwin->fault_virq, NULL,
-			pseries_vas_fault_thread_fn, IRQF_ONESHOT,
+	rc = request_threaded_irq(txwin->fault_virq,
+			pseries_vas_irq_handler,
+			pseries_vas_fault_thread_fn, 0,
 			txwin->name, txwin);
 	if (rc) {
 		pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n",
diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h
index 333ffa2f9f42..a2cb12a31c17 100644
--- a/arch/powerpc/platforms/pseries/vas.h
+++ b/arch/powerpc/platforms/pseries/vas.h
@@ -132,6 +132,7 @@ struct pseries_vas_window {
 	u64 flags;
 	char *name;
 	int fault_virq;
+	atomic_t pending_faults; /* Number of pending faults */
 };
 
 int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps);
--
cgit v1.2.3


From 2147783d6bf0b7ca14c72a25527dc5135bd17f65 Mon Sep 17 00:00:00 2001
From: Haren Myneni
Date: Thu, 6 Oct 2022 22:29:59 -0700
Subject: powerpc/pseries: Use lparcfg to reconfig VAS windows for DLPAR CPU

The hypervisor assigns VAS (Virtual Accelerator Switchboard) windows based on the cores configured in the LPAR. The kernel uses the OF reconfig notifier to reconfigure VAS windows on DLPAR CPU events.

For a partition in shared CPU mode, the hypervisor assigns VAS windows based on the CPU entitled capacity, not on the number of vcpus. When the user changes the CPU entitled capacity for the partition, drmgr uses the /proc/ppc64/lparcfg interface to notify the kernel.

This patch adds the following changes to update VAS resources for shared mode:
- Call VAS reconfig windows from lparcfg_write()
- Ignore reconfig changes in the VAS notifier

Signed-off-by: Haren Myneni
[mpe: Rework error handling, report any errors as EIO]
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/efa9c16e4a78dda4567a16f13dabfd73cb4674a2.camel@linux.ibm.com
---
 arch/powerpc/platforms/pseries/lparcfg.c | 11 ++++++++
 arch/powerpc/platforms/pseries/vas.c     | 43 +++++++++++++++++++++-----------
 arch/powerpc/platforms/pseries/vas.h     |  5 ++++
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 507dc0b5987d..63fd925ccbb8 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -35,6 +35,7 @@
 #include
 #include "pseries.h"
+#include "vas.h"	/* pseries_vas_dlpar_cpu() */
 
 /*
  * This isn't a module but we expose that to userspace
 */
@@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 			return -EINVAL;
 
 		retval = update_ppp(new_entitled_ptr, NULL);
+
+		if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
+			/*
+			 * The hypervisor assigns VAS resources based
+			 * on entitled capacity for shared mode.
+			 * Reconfig VAS windows based on DLPAR CPU events.
+ */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 53b36381489e..4ad6e510d405 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -852,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds) mutex_unlock(&vas_pseries_mutex); return rc; } + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + /* * Total number of default credits available (target_credits) * in LPAR depends on number of cores configured. It varies based on @@ -866,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int new_nr_creds, len, rc = 0; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -878,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, - vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (!rc) { - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, - new_nr_creds); - } - - if (rc) - pr_err("Failed reconfig VAS capabilities with DLPAR\n"); - - return rc; + return pseries_vas_dlpar_cpu(); } static struct notifier_block pseries_vas_nb = { diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index a2cb12a31c17..7115043ec488 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -141,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); #else static inline int vas_migration_handler(int action) { return 0; } +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} #endif #endif /* _VAS_H */ -- cgit v1.2.3 From be83d5485da549d934ec65463ea831709f2827b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:08 +1000 Subject: powerpc/64s: Add lockdep for HPTE lock Add lockdep annotation for the HPTE bit-spinlock. Modern systems don't take the tlbie lock, so this shows up some of the same lockdep warnings that were being reported by the ppc970. And they're not taken in exactly the same places so this is nice to have in its own right. 
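[For readers skimming the diff below, the annotation pattern this commit applies can be condensed into the following sketch. It is illustrative only, not additional kernel code: the names (hpte_lock_map, HPTE_LOCK_BIT) follow the patch, the wrapper functions here are hypothetical, and cpu_relax() assumes kernel context.]

#include <linux/lockdep.h>
#include <linux/bitops.h>

/* A shadow lockdep map for a hand-rolled bit-spinlock that lockdep cannot otherwise see. */
static struct lockdep_map hpte_lock_map =
	STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map);

static inline void hpte_bit_lock(unsigned long *word)
{
	/* Record the acquire with lockdep, then take the real bit lock. */
	lock_map_acquire(&hpte_lock_map);
	while (test_and_set_bit_lock(HPTE_LOCK_BIT, word))
		cpu_relax();
}

static inline void hpte_bit_unlock(unsigned long *word)
{
	/* Tell lockdep the lock is being released, then clear the bit. */
	lock_map_release(&hpte_lock_map);
	clear_bit_unlock(HPTE_LOCK_BIT, word);
}

[In the actual patch below, release_hpte_lock() is also called on the paths where storing hptep->v implicitly drops the lock, so lockdep sees every release.]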
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-1-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 42 ++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..7a640e610ea3 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -279,6 +304,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } + release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -327,6 +353,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -517,10 +544,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +608,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +791,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } -- cgit v1.2.3 From 35159b5717fa9c6031fdd6a2193c7a3dc717ce33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:09 +1000 Subject: powerpc/64s: make HPTE lock and native_tlbie_lock irq-safe With kfence enabled, there are several cases where HPTE and TLBIE locks are called from softirq context, for example: WARNING: inconsistent lock state 6.0.0-11845-g0cbbc95b12ac #1 Tainted: G N -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. 
swapper/0/1 [HC0[0]:SC0[0]:HE1:SE1] takes: c000000002734de8 (native_tlbie_lock){+.?.}-{2:2}, at: .native_hpte_updateboltedpp+0x1a4/0x600 {IN-SOFTIRQ-W} state was registered at: .lock_acquire+0x20c/0x520 ._raw_spin_lock+0x4c/0x70 .native_hpte_invalidate+0x62c/0x840 .hash__kernel_map_pages+0x450/0x640 .kfence_protect+0x58/0xc0 .kfence_guarded_free+0x374/0x5a0 .__slab_free+0x3d0/0x630 .put_cred_rcu+0xcc/0x120 .rcu_core+0x3c4/0x14e0 .__do_softirq+0x1dc/0x7dc .do_softirq_own_stack+0x40/0x60 Fix this by consistently disabling irqs while taking either of these locks. Don't just disable bh because several of the more common cases already disable irqs, so this just makes the locks always irq-safe. Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 7a640e610ea3..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -268,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -288,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -304,7 +309,6 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hpte_v = hpte_old_to_new_v(hpte_v); } - release_hpte_lock(); hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -312,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -366,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -409,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -472,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -490,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. 
*/ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -503,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -520,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } -- cgit v1.2.3 From b12eb279ff552bd67c167b0fe701ae602aa7311e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 09:07:10 +1000 Subject: powerpc/64s: make linear_map_hash_lock a raw spinlock This lock is taken while the raw kfence_freelist_lock is held, so it must also be a raw spinlock, as reported by lockdep when raw lock nesting checking is enabled. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013230710.1987253-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index df008edf7be0..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1981,7 +1981,7 @@ repeat: } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_SPINLOCK(linear_map_hash_lock); +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { @@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); if (!(linear_map_hash_slots[lmi] & 0x80)) { - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); return; } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; -- cgit v1.2.3 From b9ef323ea1682f9837bf63ba10c5e3750f71a20a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:45 +1000 Subject: powerpc/64s: Disable preemption in hash lazy mmu mode apply_to_page_range on kernel pages does not disable preemption, which is a requirement for hash's lazy mmu mode, which keeps track of the TLBs to flush with a per-cpu array. 
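[The reasoning above is the standard per-CPU-data rule: the TLB batch is filled across several page-table updates on what must remain the same CPU. A minimal sketch of the shape of the fix follows; the struct and function names here are simplified placeholders, not the kernel's (the real change is the small diff below).]

#include <linux/percpu.h>
#include <linux/preempt.h>

struct tlb_batch {
	int index;	/* number of queued translations */
	/* ... pending entries ... */
};
static DEFINE_PER_CPU(struct tlb_batch, tlb_batch);

static inline void enter_lazy_mmu(void)
{
	struct tlb_batch *batch;

	/*
	 * The batch is per-CPU state filled across many calls; if the task
	 * migrated before leave_lazy_mmu() it would flush the wrong CPU's
	 * batch, so pin the task to this CPU for the duration.
	 */
	preempt_disable();
	batch = this_cpu_ptr(&tlb_batch);
	batch->index = 0;
}

static inline void leave_lazy_mmu(void)
{
	struct tlb_batch *batch = this_cpu_ptr(&tlb_batch);

	if (batch->index) {
		/* flush the queued translations here */
	}
	preempt_enable();
}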
Reported-by: Guenter Roeck Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) -- cgit v1.2.3 From 2b2095f3a6b43ec36ff890febc588df1ec32e826 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:46 +1000 Subject: powerpc/64s: Fix hash__change_memory_range preemption warning stop_machine_cpuslocked takes a mutex so it must be called in a preemptible context, so it can't simply be fixed by disabling preemption. This is not a bug, because CPU hotplug is locked, so this processor will call in to the stop machine function. So raw_smp_processor_id() could be used. This leaves a small chance that this thread will be migrated to another CPU, so the master work would be done by a CPU from a different context. Better for test coverage to make that a common case by just having the first CPU to call in become the master. Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_pgtable.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 747492edb75a..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. 
+ if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); -- cgit v1.2.3 From 00ff1eaac129a24516a3f6d75adfb9df1efb55dd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 01:16:47 +1000 Subject: powerpc: Fix reschedule bug in KUAP-unlocked user copy schedule must not be explicitly called while KUAP is unlocked, because the AMR register will not be saved across the context switch on 64s (preemption is allowed because that is driven by interrupts which do save the AMR). exit_vmx_usercopy() runs inside an unlocked user access region, and it calls preempt_enable() which will call schedule() if need_resched() was set while non-preemptible. This can cause tasks to run unprotected when the should not, and can cause the user copy to be improperly blocked when scheduling back to it. Fix this by avoiding the explicit resched for preempt kernels by generating an interrupt to reschedule the context if need_resched() got set. Reported-by: Samuel Holland Signed-off-by: Nicholas Piggin Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221013151647.1857994-3-npiggin@gmail.com --- arch/powerpc/lib/vmx-helper.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index f76a50291fd7..d491da8d1838 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -36,7 +36,17 @@ int exit_vmx_usercopy(void) { disable_kernel_altivec(); pagefault_enable(); - preempt_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); return 0; } -- cgit v1.2.3 From e59b3399fde5e173b026d4952b215043e77b4521 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:28 +1000 Subject: KVM: PPC: BookS PR-KVM and BookE do not support context tracking The context tracking code in PR-KVM and BookE implementations is not complete, and can cause host crashes if context tracking is enabled. Make these implementations depend on !CONTEXT_TRACKING_USER. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-2-npiggin@gmail.com --- arch/powerpc/kvm/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 61cdd782d3c5..a9f57dad6d91 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on !CONTEXT_TRACKING_USER select KVM select KVM_BOOK3S_32_HANDLER select KVM_BOOK3S_PR_POSSIBLE @@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV config KVM_BOOK3S_64_PR tristate "KVM support without using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CONTEXT_TRACKING_USER select KVM_BOOK3S_PR_POSSIBLE help Support running guest kernels in virtual machines on processors @@ -190,6 +192,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" depends on PPC_E500 && !PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select MMU_NOTIFIER @@ -205,6 +208,7 @@ config KVM_E500V2 config KVM_E500MC bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select KVM_BOOKE_HV -- cgit v1.2.3 From a073672eb09670540e95a2a4aa1c46f5da74159f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 14 Oct 2022 13:07:29 +1000 Subject: powerpc/64/interrupt: Prevent NMI PMI causing a dangerous warning NMI PMIs really should not return using the normal interrupt_return function. If such a PMI hits in code returning to user with the context switched to user mode, this warning can fire. This was enough to cause crashes when reproducing on 64s, because another perf interrupt would hit while reporting bug, and that would cause another bug, and so on until smashing the stack. Work around that particular crash for now by just disabling that context warning for PMIs. This is a hack and not a complete fix, there could be other such problems lurking in corners. But it does fix the known crash. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221014030729.2077151-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64e.S | 7 +++++++ arch/powerpc/kernel/interrupt.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 930e36099015..2f68fb2ee4fc 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -813,6 +813,13 @@ kernel_dbg_exc: EXCEPTION_COMMON(0x260) CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). 
+ */ bl performance_monitor_exception b interrupt_return diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index f9db0a172401..7bc93367de68 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -374,10 +374,16 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* - * CT_WARN_ON comes here via program_check_exception, - * so avoid recursion. + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs to work around a problem caused by NMI + * PMIs incorrectly taking this interrupt return path, it's possible + * for this to hit after interrupt exit to user switches context to + * user. See also the comment in the performance monitor handler in + * exceptions-64e/s.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM) + if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); -- cgit v1.2.3 From dc398a084d459f065658855454e09f2778f8c5cc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 7 Oct 2022 00:04:11 +1000 Subject: powerpc/64s/interrupt: Perf NMI should not take normal exit path NMI interrupts should exit with EXCEPTION_RESTORE_REGS not with interrupt_return_srr, which is what the perf NMI handler currently does. This breaks if a PMI hits after interrupt_exit_user_prepare_main() has switched the context tracking to user mode, then the CT_WARN_ON() in interrupt_exit_kernel_prepare() fires because it returns to kernel with context set to user. This could possibly be solved by soft-disabling PMIs in the exit path, but that reduces our ability to profile that code. The warning could be removed, but it's potentially useful. All other NMIs and soft-NMIs return using EXCEPTION_RESTORE_REGS, so this makes perf interrupts consistent with that and seems like the best fix. Signed-off-by: Nicholas Piggin [mpe: Squash in fixups from Nick] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221006140413.126443-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 14 +++++++++++++- arch/powerpc/kernel/interrupt.c | 14 ++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..651c36b056bd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor addi r3,r1,STACK_FRAME_OVERHEAD - bl performance_monitor_exception + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl performance_monitor_exception_async b interrupt_return_srr +1: + bl performance_monitor_exception_nmi + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL /** * Interrupt 0xf20 - Vector Unavailable Interrupt. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 7bc93367de68..fc6631a80527 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -377,13 +377,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) * CT_WARN_ON comes here via program_check_exception, so avoid * recursion. 
* - * Skip the assertion on PMIs to work around a problem caused by NMI - * PMIs incorrectly taking this interrupt return path, it's possible - * for this to hit after interrupt exit to user switches context to - * user. See also the comment in the performance monitor handler in - * exceptions-64e/s.S + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); -- cgit v1.2.3 From 65722736c3baf29e02e964a09e85c9ef71c48e8d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 22 Oct 2022 15:22:07 +1000 Subject: powerpc/64s/interrupt: Fix clear of PACA_IRQS_HARD_DIS when returning to soft-masked context Commit a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") fixed the problem of pending irqs being cleared when clearing the HARD_DIS bit, but then it didn't clear the bit at all. This change clears HARD_DIS without affecting other bits in the mask. When an interrupt hits in a soft-masked section that has MSR[EE]=1, it can hard disable and set PACA_IRQS_HARD_DIS, which must be cleared when returning to the EE=1 caller (unless it was set due to a MUST_HARD_MASK interrupt becoming pending). Failure to clear this leaves the returned-to context running with MSR[EE]=1 and PACA_IRQS_HARD_DIS, which confuses irq assertions and could be dangerous for code that might test the flag. This was observed in a hash MMU kernel where a kernel hash fault hits in a local_irqs_disabled region that has EE=1. The hash fault also runs with EE=1, then as it returns, a decrementer hits in the restart section and the irq restart code hard-masks which sets the PACA_IRQ_HARD_DIS flag, which is not clear when the original context is returned to. Reported-by: Sachin Sant Fixes: a4cb3651a1743 ("powerpc/64s/interrupt: Fix lost interrupts when returning to soft-masked context") Signed-off-by: Nicholas Piggin Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20221022052207.471328-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 978a173eb339..a019ed6fc839 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) * Returning to soft-disabled context. * Check if a MUST_HARD_MASK interrupt has become pending, in which * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. */ ld r12,_MSR(r1) andi. r10,r12,MSR_EE beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. 
r10,r11,PACA_IRQ_MUST_HARD_MASK - beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() + - /* Must clear MSR_EE from _MSR */ +1: /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S li r10,0 /* Clear valid before changing _MSR */ -- cgit v1.2.3
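[For readers who do not follow the 64-bit assembly in the last hunk, the new return-path logic is roughly equivalent to the following C. This is an illustration of the control flow only, not code taken from the kernel; the field and macro names (local_paca->irq_happened, PACA_IRQ_MUST_HARD_MASK, PACA_IRQ_HARD_DIS, MSR_EE) follow arch/powerpc usage.]

	u8 happened = local_paca->irq_happened;

	if (happened & PACA_IRQ_MUST_HARD_MASK) {
		/*
		 * A hard-masked interrupt became pending: return to the
		 * soft-masked context with MSR[EE] cleared in the saved MSR
		 * so it cannot be taken until interrupts are replayed.
		 */
		regs->msr &= ~MSR_EE;
	} else {
		/*
		 * Nothing is pending: clear only PACA_IRQ_HARD_DIS, leaving
		 * any other irq_happened bits intact, so the returned-to
		 * context does not run with MSR[EE]=1 and HARD_DIS both set.
		 */
		local_paca->irq_happened = happened & ~PACA_IRQ_HARD_DIS;
	}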