diff options
author | Hugh Dickins <hugh.dickins@tiscali.co.uk> | 2009-12-14 17:58:46 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-15 08:53:15 -0800 |
commit | 570a335b8e22579e2a51a68136d2b1f907a20eec (patch) | |
tree | c5312383e948d2e7ac60c2fa410fee98e8b38a70 /mm/memory.c | |
parent | 8d69aaee80c123b460918816cbfa2e83224c3646 (diff) | |
download | linux-570a335b8e22579e2a51a68136d2b1f907a20eec.tar.bz2 |
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 19 |
1 files changed, 16 insertions, 3 deletions
diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..543c446bf4ed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -572,7 +572,7 @@ out: * covered by this vma. */ -static inline void +static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) @@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) + return entry.val; + /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[2]; + swp_entry_t entry = (swp_entry_t){0}; again: rss[1] = rss[0] = 0; @@ -674,7 +678,10 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); + if (entry.val) + break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -684,6 +691,12 @@ again: add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); + + if (entry.val) { + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + return -ENOMEM; + progress = 0; + } if (addr != end) goto again; return 0; |