summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/mm.h2
-rw-r--r--mm/internal.h5
-rw-r--r--mm/memory.c17
-rw-r--r--mm/oom_kill.c151
4 files changed, 162 insertions, 13 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 450fc977ed02..ed6407d1b7b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1132,6 +1132,8 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
+ bool ignore_dirty; /* Ignore dirty pages */
+ bool check_swap_entries; /* Check also swap entries */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/internal.h b/mm/internal.h
index 7449392c6faa..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,11 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
+
extern int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size);
diff --git a/mm/memory.c b/mm/memory.c
index 81dca0083fcd..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1102,6 +1102,12 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
+ /*
+ * oom_reaper cannot tear down dirty
+ * pages
+ */
+ if (unlikely(details && details->ignore_dirty))
+ continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1120,8 +1126,8 @@ again:
}
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
+ /* only check swap_entries if explicitly asked for in details */
+ if (unlikely(details && !details->check_swap_entries))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping)
- details = NULL;
-
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details;
+ struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 06f7e1707847..f7ed6ece0719 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /* We might have raced with exit path */
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ return true;
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * mlocked VMAs require explicit munlocking before unmap.
+ * Let's keep it simple here and skip such VMAs.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ up_read(&mm->mmap_sem);
+out:
+ mmput(mm);
+ return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < 10 && !__oom_reap_vmas(mm))
+ schedule_timeout_idle(HZ/10);
+
+ /* Drop a reference taken by wake_oom_reaper */
+ mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+ while (true) {
+ struct mm_struct *mm;
+
+ wait_event_freezable(oom_reaper_wait,
+ (mm = READ_ONCE(mm_to_reap)));
+ oom_reap_vmas(mm);
+ WRITE_ONCE(mm_to_reap, NULL);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+ struct mm_struct *old_mm;
+
+ if (!oom_reaper_th)
+ return;
+
+ /*
+ * Pin the given mm. Use mm_count instead of mm_users because
+ * we do not want to delay the address space tear down.
+ */
+ atomic_inc(&mm->mm_count);
+
+ /*
+ * Make sure that only a single mm is ever queued for the reaper
+ * because multiple are not necessary and the operation might be
+ * disruptive so better reduce it to the bare minimum.
+ */
+ old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+ if (!old_mm)
+ wake_up(&oom_reaper_wait);
+ else
+ mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ bool can_oom_reap = true;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD))
- continue;
- if (is_global_init(p))
- continue;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+ p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ /*
+ * We cannot use oom_reaper for the mm shared by this
+ * process because it wouldn't get killed and so the
+ * memory might be still used.
+ */
+ can_oom_reap = false;
continue;
-
+ }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
+ if (can_oom_reap)
+ wake_oom_reaper(mm);
+
mmdrop(mm);
put_task_struct(victim);
}