From c97102ba96324da330078ad8619ba4dfe840dbe3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Dec 2013 17:08:31 -0800 Subject: kexec: migrate to reboot cpu Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") moved reboot= handling to generic code. In the process it also removed the code in native_machine_shutdown() which are moving reboot process to reboot_cpu/cpu0. I guess that thought must have been that all reboot paths are calling migrate_to_reboot_cpu(), so we don't need this special handling. But kexec reboot path (kernel_kexec()) is not calling migrate_to_reboot_cpu() so above change broke kexec. Now reboot can happen on non-boot cpu and when INIT is sent in second kerneo to bring up BP, it brings down the machine. So start calling migrate_to_reboot_cpu() in kexec reboot path to avoid this problem. Bisected by WANG Chao. Reported-by: Matthew Whitehead Reported-by: Dave Young Signed-off-by: Vivek Goyal Tested-by: Baoquan He Tested-by: WANG Chao Acked-by: H. Peter Anvin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 8e00f9f6f963..9e7db9e73cc1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -43,6 +43,7 @@ extern int unregister_reboot_notifier(struct notifier_block *); * Architecture-specific implementations of sys_reboot commands. */ +extern void migrate_to_reboot_cpu(void); extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); -- cgit v1.2.3 From de466bd628e8d663fdf3f791bc8db318ee85c714 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:42 -0800 Subject: mm: numa: avoid unnecessary disruption of NUMA hinting during migration do_huge_pmd_numa_page() handles the case where there is parallel THP migration. However, by the time it is checked the NUMA hinting information has already been disrupted. This patch adds an earlier check with some helpers. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++++++ mm/huge_memory.c | 22 ++++++++++++++++------ mm/migrate.c | 12 ++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f5096b58b20d..b7717d74da7f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -90,10 +90,19 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING +extern bool pmd_trans_migrating(pmd_t pmd); +extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); #else +static inline bool pmd_trans_migrating(pmd_t pmd) +{ + return false; +} +static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ +} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 70e7429fd8ea..7de1bf85f683 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -882,6 +882,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; goto out_unlock; } + + /* mmap_sem prevents this happening but warn if that changes */ + WARN_ON(pmd_trans_migrating(pmd)); + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ spin_unlock(src_ptl); @@ -1299,6 +1303,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. + */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); @@ -1329,12 +1344,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } - /* - * If there are potential migrations, wait for completion and retry. We - * do not relock and check_same as the page may no longer be mapped. - * Furtermore, even if the page is currently misplaced, there is no - * guarantee it is still misplaced after the migration completes. - */ + /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { spin_unlock(ptl); wait_on_page_locked(page); diff --git a/mm/migrate.c b/mm/migrate.c index a987525810ae..cfb419085261 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1655,6 +1655,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 1; } +bool pmd_trans_migrating(pmd_t pmd) +{ + struct page *page = pmd_page(pmd); + return PageLocked(page); +} + +void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + wait_on_page_locked(page); +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit v1.2.3 From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 18 Dec 2013 17:08:44 -0800 Subject: mm: fix TLB flush race between migration, and change_protection_range There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side. The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed. During that time, a CPU may continue writing to the page. This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away. When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process. This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC). The basic race looks like this: CPU A CPU B CPU C load TLB entry make entry PTE/PMD_NUMA fault on entry read/write old page start migrating page change PTE/PMD to new page read/write old page [*] flush TLB reload TLB from new entry read/write new page lose data [*] the old page may belong to a new user at this point! The obvious fix is to flush remote TLB entries, by making sure that pte_accessible aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm. This should fix both NUMA migration and compaction. [mgorman@suse.de: fix build] Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable.h | 11 ++++++++-- include/asm-generic/pgtable.h | 2 +- include/linux/mm_types.h | 44 +++++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + mm/huge_memory.c | 7 ++++++ mm/mprotect.c | 2 ++ mm/pgtable-generic.c | 5 +++-- 8 files changed, 69 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 8358dc144959..0f9e94537eee 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte) } #define pte_accessible pte_accessible -static inline unsigned long pte_accessible(pte_t a) +static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) { return pte_val(a) & _PAGE_VALID; } @@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U * and SUN4V pte layout, so this inline test is fine. */ - if (likely(mm != &init_mm) && pte_accessible(orig)) + if (likely(mm != &init_mm) && pte_accessible(mm, orig)) tlb_batch_add(mm, addr, ptep, orig, fullmm); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3d1999458709..bbc8b12fa443 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -452,9 +452,16 @@ static inline int pte_present(pte_t a) } #define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) +static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { - return pte_flags(a) & _PAGE_PRESENT; + if (pte_flags(a) & _PAGE_PRESENT) + return true; + + if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + mm_tlb_flush_pending(mm)) + return true; + + return false; } static inline int pte_hidden(pte_t pte) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f330d28e4d0e..b12079afbd5f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #endif #ifndef pte_accessible -# define pte_accessible(pte) ((void)(pte),1) +# define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bd299418a934..e5c49c30460f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -442,6 +442,14 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + /* + * An operation with batched TLB flushing is going on. Anything that + * can move process memory needs to flush the TLB when moving a + * PROT_NONE or PROT_NUMA mapped page. + */ + bool tlb_flush_pending; #endif struct uprobes_state uprobes_state; }; @@ -459,4 +467,40 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return mm->cpu_vm_mask_var; } +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) +/* + * Memory barriers to keep this state in sync are graciously provided by + * the page table locks, outside of which no page table modifications happen. + * The barriers below prevent the compiler from re-ordering the instructions + * around the memory barriers that are already present in the code. + */ +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + return mm->tlb_flush_pending; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ + mm->tlb_flush_pending = true; + barrier(); +} +/* Clearing is done after a TLB flush, which also provides a barrier. */ +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + mm->tlb_flush_pending = false; +} +#else +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + return false; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ +} +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ +} +#endif + #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be9548c..5721f0e3f2da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); mm_init_owner(mm, p); + clear_tlb_flush_pending(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..3d2783e10596 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,6 +1376,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } + /* + * The page_table_lock above provides a memory barrier + * with change_protection_range. + */ + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. diff --git a/mm/mprotect.c b/mm/mprotect.c index f8421722acb9..bb53a6591aea 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -188,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -199,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); return pages; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e84cad27a801..a8b919925934 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + struct mm_struct *mm = (vma)->vm_mm; pte_t pte; - pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } -- cgit v1.2.3 From af2c1401e6f9177483be4fad876d0073669df9df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:45 -0800 Subject: mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates According to documentation on barriers, stores issued before a LOCK can complete after the lock implying that it's possible tlb_flush_pending can be visible after a page table update. As per revised documentation, this patch adds a smp_mb__before_spinlock to guarantee the correct ordering. Signed-off-by: Mel Gorman Acked-by: Paul E. McKenney Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e5c49c30460f..ad0616f2fe2c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -482,7 +482,12 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) static inline void set_tlb_flush_pending(struct mm_struct *mm) { mm->tlb_flush_pending = true; - barrier(); + + /* + * Guarantee that the tlb_flush_pending store does not leak into the + * critical section updating the page tables + */ + smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void clear_tlb_flush_pending(struct mm_struct *mm) -- cgit v1.2.3