binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot

In both binfmt_elf and binfmt_elf_fdpic, use a new helper dump_vma_snapshot() to take a snapshot of the VMA list (including the gate VMA, if we have one) while protected by the mmap_lock, and then use that snapshot instead of walking the VMA list without locking. An alternative approach would be to keep the mmap_lock held across the entire core dumping operation; however, keeping the mmap_lock locked while we may be blocked for an unbounded amount of time (e.g. because we're dumping to a FUSE filesystem or similar) isn't really optimal; the mmap_lock blocks things like the ->release handler of userfaultfd, and we don't really want critical system daemons to grind to a halt just because someone "gifted" them, via SCM_RIGHTS, an eternally-locked userfaultfd, or something like that. Since both the normal ELF code and the FDPIC ELF code need this functionality (and if any other binfmt wants to add coredump support in the future, it would probably need it, too), implement this with a common helper in fs/coredump.c. A downside of this approach is that we now need a bigger amount of kernel memory per userspace VMA in the normal ELF case, and that the FDPIC ELF case now needs O(n) kernel memory at all; but 40 bytes per VMA shouldn't be terribly bad. There is currently a data race between stack expansion and anything that reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to mitigate that for core dumping, take the mmap_lock in write mode when taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in read mode, we could end up with a corrupted core dump if someone does get_user_pages_remote() concurrently. Not really a major problem, but taking the mmap_lock in either mode works here, so we might as well avoid the issue.) (This doesn't do anything about the existing data races with stack expansion in other mm code.)
Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: "Eric W . Biederman" <ebiederm@xmission.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Hugh Dickins <hughd@google.com> Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Jann Horn <jannh@google.com> 2020-10-15 20:12:54 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2020-10-16 11:11:21 -0700
commit: a07279c9a8cd7dbd321640ff7210591599ee00a4 (patch)
tree: c1f27d5713449e3ec7762dcaddae067a19f2c145 /fs/binfmt_elf_fdpic.c
parent: 429a22e776a2b9f85a2b9c53d8e647598b553dd1 (diff)
download: linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.tar.bz2
1 files changed, 27 insertions, 40 deletions
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f531c6198864..be4062b8ba75 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1454,29 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
 /*
  * dump the segments for an MMU process
  */
-static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
+				    struct core_vma_metadata *vma_meta,
+				    int vma_count)
 {
-	struct vm_area_struct *vma;
+	int i;
 
-	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-		unsigned long size = vma_dump_size(vma, cprm->mm_flags);
+	for (i = 0; i < vma_count; i++) {
+		struct core_vma_metadata *meta = vma_meta + i;
 
-		if (!dump_user_range(cprm, vma->vm_start, size))
+		if (!dump_user_range(cprm, meta->start, meta->dump_size))
 			return false;
 	}
 	return true;
 }
 
-static size_t elf_core_vma_data_size(unsigned long mm_flags)
-{
-	struct vm_area_struct *vma;
-	size_t size = 0;
-
-	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
-		size += vma_dump_size(vma, mm_flags);
-	return size;
-}
-
 /*
  * Actual dumper
  *
@@ -1487,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 static int elf_fdpic_core_dump(struct coredump_params *cprm)
 {
 	int has_dumped = 0;
-	int segs;
+	int vma_count, segs;
 	int i;
-	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff;
 	struct memelfnote psinfo_note, auxv_note;
@@ -1503,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	elf_addr_t e_shoff;
 	struct core_thread *ct;
 	struct elf_thread_status *tmp;
-
-	/*
-	 * We no longer stop all VM operations.
-	 *
-	 * This is because those proceses that could possibly change map_count
-	 * or the mmap / vma pages are now blocked in do_exit on current
-	 * finishing this core dump.
-	 *
-	 * Only ptrace can touch these memory addresses, but it doesn't change
-	 * the map_count or the pages allocated. So no possibility of crashing
-	 * exists while dumping the mm->vm_next areas to the core file.
-	 */
+	struct core_vma_metadata *vma_meta = NULL;
+	size_t vma_data_size;
 
 	/* alloc memory for large data structures: too large to be on stack */
 	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1524,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	if (!psinfo)
 		goto end_coredump;
 
+	if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+		goto end_coredump;
+
 	for (ct = current->mm->core_state->dumper.next;
 					ct; ct = ct->next) {
 		tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1543,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	tmp->next = thread_list;
 	thread_list = tmp;
 
-	segs = current->mm->map_count;
-	segs += elf_core_extra_phdrs();
+	segs = vma_count + elf_core_extra_phdrs();
 
 	/* for notes section */
 	segs++;
@@ -1589,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	/* Page-align dumped data */
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	offset += elf_core_vma_data_size(cprm->mm_flags);
+	offset += vma_data_size;
 	offset += elf_core_extra_data_size();
 	e_shoff = offset;
 
@@ -1609,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	/* write program headers for segments dump */
-	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+	for (i = 0; i < vma_count; i++) {
+		struct core_vma_metadata *meta = vma_meta + i;
 		struct elf_phdr phdr;
 		size_t sz;
 
-		sz = vma->vm_end - vma->vm_start;
+		sz = meta->end - meta->start;
 
 		phdr.p_type = PT_LOAD;
 		phdr.p_offset = offset;
-		phdr.p_vaddr = vma->vm_start;
+		phdr.p_vaddr = meta->start;
 		phdr.p_paddr = 0;
-		phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
+		phdr.p_filesz = meta->dump_size;
 		phdr.p_memsz = sz;
 		offset += phdr.p_filesz;
-		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
-		if (vma->vm_flags & VM_WRITE)
+		phdr.p_flags = 0;
+		if (meta->flags & VM_READ)
+			phdr.p_flags |= PF_R;
+		if (meta->flags & VM_WRITE)
 			phdr.p_flags |= PF_W;
-		if (vma->vm_flags & VM_EXEC)
+		if (meta->flags & VM_EXEC)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
@@ -1657,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	if (!dump_skip(cprm, dataoff - cprm->pos))
 		goto end_coredump;
 
-	if (!elf_fdpic_dump_segments(cprm))
+	if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
 		goto end_coredump;
 
 	if (!elf_core_write_extra_data(cprm))
@@ -1681,6 +1667,7 @@ end_coredump:
 		thread_list = thread_list->next;
 		kfree(tmp);
 	}
+	kvfree(vma_meta);
 	kfree(phdr4note);
 	kfree(elf);
 	kfree(psinfo);
author	Jann Horn <jannh@google.com>	2020-10-15 20:12:54 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-10-16 11:11:21 -0700
commit	a07279c9a8cd7dbd321640ff7210591599ee00a4 (patch)
tree	c1f27d5713449e3ec7762dcaddae067a19f2c145 /fs/binfmt_elf_fdpic.c
parent	429a22e776a2b9f85a2b9c53d8e647598b553dd1 (diff)
download	linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.tar.bz2