author    | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-29 11:45:07 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-11-02 16:10:30 -0700
commit    | 28154ddc676efa64e8e792389787eb85199d2772 (patch)
tree      | 6bf1a0ba9189c8e904a91b71e01ec5e44651c8a3 /include/asm-generic/tlb.h
parent    | 655d4bdee63563392b0e5fb40f973c6d41658070 (diff)
download  | linux-28154ddc676efa64e8e792389787eb85199d2772.tar.bz2
mm: delay rmap removal until after TLB flush (mmu_gather-race-fix)
When we remove a page table entry, we are very careful to only free the
page after we have flushed the TLB, because other CPUs could still be
using the page through stale TLB entries until after the flush.
However, we have removed the rmap entry for that page early, which means
that functions like folio_mkclean() would end up not serializing with
the page table lock because the page had already been made invisible to
rmap.
And that is a problem, because while the TLB entry exists, we could end
up with the following situation:
(a) one CPU could come in and clean it, never seeing our mapping of
the page
(b) another CPU could continue to use the stale and dirty TLB entry
and continue to write to said page
resulting in a page that has been dirtied, but then marked clean again,
all while another CPU might have dirtied it some more.
End result: possibly lost dirty data.
This commit uses the same old TLB gather array that we use to delay the
freeing of the page to also say 'remove from rmap after flush', so that
we can keep the rmap entries alive until all TLB entries have been
flushed.
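As a rough sketch of what that means on the mmu_gather side (the mm/mmu_gather.c changes are not part of this diff, so the batch walk below and the page_zap_pte_rmap() call are an approximation based on the description, not the literal patch), the post-flush walk over the gather batches first drops the rmap entries of pages queued with the new flag and only then frees the pages:

static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		struct encoded_page **pages = batch->encoded_pages;
		unsigned int i;

		/* The TLB has been flushed: it is now safe to drop the rmap */
		for (i = 0; i < batch->nr; i++) {
			if (encoded_page_flags(pages[i]))	/* TLB_ZAP_RMAP was set */
				page_zap_pte_rmap(encoded_page_ptr(pages[i]));
		}

		/* Only after that do the pages actually get freed */
		for (i = 0; i < batch->nr; i++)
			free_page_and_swap_cache(encoded_page_ptr(pages[i]));

		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}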
It might be worth noting that this means page_zap_pte_rmap() is now called
outside the page table lock. That lock never provided mutual exclusion for
the rmap removal anyway (the same page can be mapped under multiple
different page tables), but it does mean the function needs to use the more
careful version of dec_lruvec_page_state() that doesn't depend on being
called in a non-preemptible context.
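page_zap_pte_rmap() itself is not in this diff (it lives in mm/rmap.c in this series), but an approximate sketch of what that looks like follows; dec_lruvec_page_state() handles preemption/interrupts internally, while __dec_lruvec_page_state() requires the caller to already be non-preemptible:

static void page_zap_pte_rmap(struct page *page)
{
	/* Only the last mapping of the page updates the counters */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	lock_page_memcg(page);
	/*
	 * dec_lruvec_page_state() (no leading underscores) is safe from a
	 * preemptible context, which matters now that this runs outside
	 * the page table lock; __dec_lruvec_page_state() would not be.
	 */
	dec_lruvec_page_state(page,
		PageAnon(page) ? NR_ANON_MAPPED : NR_FILE_MAPPED);
	unlock_page_memcg(page);
}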
NOTE! While the "possibly lost dirty data" sounds catastrophic, for this
all to happen you need to have a user thread doing either madvise() with
MADV_DONTNEED or a full re-mmap() of the area concurrently with another
thread continuing to use said mapping.
So arguably this is about user space doing crazy things, but from a VM
consistency standpoint it's better if we track the dirty bit properly
even when user space goes off the rails.
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Link: https://lore.kernel.org/all/B88D3073-440A-41C7-95F4-895D3F657EF2@gmail.com/
Cc: Will Deacon <will@kernel.org>
Cc: Aneesh Kumar <aneesh.kumar@linux.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com> # s390
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/asm-generic/tlb.h')
-rw-r--r-- | include/asm-generic/tlb.h | 39
1 file changed, 33 insertions, 6 deletions
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..b6045149d0b9 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -73,7 +73,8 @@
  * __tlb_remove_page_size() is the basic primitive that queues a page for
  * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
  * boolean indicating if the queue is (now) full and a call to
- * tlb_flush_mmu() is required.
+ * tlb_flush_mmu() is required. They take a 'flags' parameter that
+ * states whether the rmap of the page should be removed after TLB flush.
  *
  * tlb_remove_page() and tlb_remove_page_size() imply the call to
  * tlb_flush_mmu() when required and has no return value.
@@ -187,6 +188,7 @@
  * This is useful if your architecture already flushes TLB entries in the
  * various ptep_get_and_clear() functions.
  */
+#define TLB_ZAP_RMAP 1ul
 
 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 
@@ -238,11 +240,36 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
  */
 #define MMU_GATHER_BUNDLE 8
 
+/*
+ * Fake type for an encoded page with flag bits in the low bits.
+ *
+ * Right now just one bit, but we could have more depending on the
+ * alignment of 'struct page'.
+ */
+struct encoded_page;
+#define ENCODE_PAGE_BITS (TLB_ZAP_RMAP)
+
+static inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
+{
+	flags &= ENCODE_PAGE_BITS;
+	return (struct encoded_page *)(flags | (unsigned long)page);
+}
+
+static inline bool encoded_page_flags(struct encoded_page *page)
+{
+	return ENCODE_PAGE_BITS & (unsigned long)page;
+}
+
+static inline struct page *encoded_page_ptr(struct encoded_page *page)
+{
+	return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
+}
+
 struct mmu_gather_batch {
 	struct mmu_gather_batch *next;
 	unsigned int nr;
 	unsigned int max;
-	struct page *pages[];
+	struct encoded_page *encoded_pages[];
 };
 
 #define MAX_GATHER_BATCH \
@@ -257,7 +284,7 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH)
 
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
-				   int page_size);
+				   int page_size, unsigned int flags);
 #endif
 
 /*
@@ -431,13 +458,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
-	if (__tlb_remove_page_size(tlb, page, page_size))
+	if (__tlb_remove_page_size(tlb, page, page_size, 0))
 		tlb_flush_mmu(tlb);
 }
 
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags)
 {
-	return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
+	return __tlb_remove_page_size(tlb, page, PAGE_SIZE, flags);
 }
 
 /* tlb_remove_page
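Finally, a sketch of the caller side (mm/memory.c is not part of this diff, so the function below is hypothetical and only shows how the new 'flags' argument is meant to be used): the PTE is cleared and the page is queued with TLB_ZAP_RMAP instead of having its rmap removed right away, so the rmap entry stays alive until the flush:

static void zap_one_pte(struct mmu_gather *tlb, struct vm_area_struct *vma,
			pte_t *pte, unsigned long addr)
{
	pte_t ptent = ptep_get_and_clear(vma->vm_mm, addr, pte);
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page)
		return;

	tlb_remove_tlb_entry(tlb, pte, addr);
	if (pte_dirty(ptent))
		set_page_dirty(page);

	/*
	 * Do not remove the rmap here; TLB_ZAP_RMAP tells the gather code
	 * to do it after the TLB flush. A 'true' return means the batch is
	 * full and we must flush now.
	 */
	if (__tlb_remove_page(tlb, page, TLB_ZAP_RMAP))
		tlb_flush_mmu(tlb);
}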