summaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c365
1 files changed, 311 insertions, 54 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..bf91dc9e7a79 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+ info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
unsigned long offset)
{
@@ -580,6 +592,21 @@ new_cluster:
return found_free;
}
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ for_each_node(nid)
+ plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+ spin_lock(&swap_avail_lock);
+ __del_from_avail_list(p);
+ spin_unlock(&swap_avail_lock);
+}
+
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(si);
+ }
+}
+
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ spin_lock(&swap_avail_lock);
+ for_each_node(nid) {
+ WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
+ spin_unlock(&swap_avail_lock);
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
bool was_full = !si->highest_bit;
si->highest_bit = end;
- if (was_full && (si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&si->avail_list));
- if (plist_node_empty(&si->avail_list))
- plist_add(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
+ if (was_full && (si->flags & SWP_WRITEOK))
+ add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
offset = idx * SWAPFILE_CLUSTER;
ci = lock_cluster(si, offset);
alloc_cluster(si, idx);
- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
+ int node;
/* Only single cluster request supported */
WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ node = numa_node_id();
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
/* requeue si to after same-priority siblings */
- plist_requeue(&si->avail_list, &swap_avail_head);
+ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_list)) {
+ if (plist_node_empty(&si->avail_lists[node])) {
spin_unlock(&si->lock);
goto nextsi;
}
@@ -934,13 +968,14 @@ start_over:
WARN(!(si->flags & SWP_WRITEOK),
"swap_info %d in list but !SWP_WRITEOK\n",
si->type);
- plist_del(&si->avail_list, &swap_avail_head);
+ __del_from_avail_list(si);
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster)
- n_ret = swap_alloc_cluster(si, swp_entries);
- else
+ if (cluster) {
+ if (!(si->flags & SWP_FILE))
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ } else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
* swap_avail_head list then try it, otherwise start over
* if we have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_list))
+ if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
struct swap_cluster_info *ci;
struct swap_info_struct *si;
unsigned char *map;
- unsigned int i;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
- si = swap_info_get(entry);
+ si = _swap_info_get(entry);
if (!si)
return;
ci = lock_cluster(si, offset);
+ VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
- map[i] = 0;
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ if (!free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] &= ~SWAP_HAS_CACHE;
}
+ cluster_clear_huge(ci);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ } else if (free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+ if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+ free_swap_slot(entry);
+ }
+ }
+}
+
+int split_swap_cluster(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = _swap_info_get(entry);
+ if (!si)
+ return -EBUSY;
+ ci = lock_cluster(si, offset);
+ cluster_clear_huge(ci);
+ unlock_cluster(ci);
+ return 0;
}
#else
static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
return count;
}
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
+ unsigned long roffset = swp_offset(entry);
+ unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!ci || !cluster_is_huge(ci)) {
+ if (map[roffset] != SWAP_HAS_CACHE)
+ ret = true;
+ goto unlock_out;
+ }
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (map[offset + i] != SWAP_HAS_CACHE) {
+ ret = true;
+ break;
+ }
+ }
+unlock_out:
+ unlock_cluster_or_swap_info(si, ci);
+ return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+
+ if (likely(!PageTransCompound(page)))
+ return page_swapcount(page) != 0;
+
+ page = compound_head(page);
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si)
+ return swap_page_trans_huge_swapped(si, entry);
+ return false;
+}
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int i, map_swapcount, _total_mapcount, _total_swapcount;
+ unsigned long offset = 0;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci = NULL;
+ unsigned char *map = NULL;
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = _total_swapcount = map_swapcount = 0;
+ if (PageSwapCache(page)) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si) {
+ map = si->swap_map;
+ offset = swp_offset(entry);
+ }
+ }
+ if (map)
+ ci = lock_cluster(si, offset);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ _total_mapcount += mapcount;
+ if (map) {
+ swapcount = swap_count(map[offset + i]);
+ _total_swapcount += swapcount;
+ }
+ map_swapcount = max(map_swapcount, mapcount + swapcount);
+ }
+ unlock_cluster(ci);
+ if (PageDoubleMap(page)) {
+ map_swapcount -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ map_swapcount += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ if (total_swapcount)
+ *total_swapcount = _total_swapcount;
+
+ return map_swapcount;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
+#define page_swapped(page) (page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+}
+#endif
+
/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
*
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
*/
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
- int count;
+ int count, total_mapcount, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
- count = page_trans_huge_mapcount(page, total_mapcount);
- if (count <= 1 && PageSwapCache(page)) {
- count += page_swapcount(page);
- if (count != 1)
- goto out;
+ count = page_trans_huge_map_swapcount(page, &total_mapcount,
+ &total_swapcount);
+ if (total_map_swapcount)
+ *total_map_swapcount = total_mapcount + total_swapcount;
+ if (count == 1 && PageSwapCache(page) &&
+ (likely(!PageTransCompound(page)) ||
+ /* The remaining swap count will be freed soon */
+ total_swapcount == page_swapcount(page))) {
if (!PageWriteback(page)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
} else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
spin_unlock(&p->lock);
}
}
-out:
+
return count <= 1;
}
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
return 0;
if (PageWriteback(page))
return 0;
- if (page_swapcount(page))
+ if (page_swapped(page))
return 0;
/*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage())
return 0;
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
count = __swap_entry_free(p, entry, 1);
- if (count == SWAP_HAS_CACHE) {
+ if (count == SWAP_HAS_CACHE &&
+ !swap_page_trans_huge_swapped(p, entry)) {
page = find_get_page(swap_address_space(entry),
swp_offset(entry));
if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
(!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_swapcount(p, entry)) {
+ !swap_page_trans_huge_swapped(p, entry)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
.sync_mode = WB_SYNC_NONE,
};
- swap_writepage(page, &wbc);
+ swap_writepage(compound_head(page), &wbc);
lock_page(page);
wait_on_page_writeback(page);
}
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
* delete, since it may not have been written out to swap yet.
*/
if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val))
- delete_from_swap_cache(page);
+ likely(page_private(page) == entry.val) &&
+ !page_swapped(page))
+ delete_from_swap_cache(compound_head(page));
/*
* So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
+static int swap_node(struct swap_info_struct *p)
+{
+ struct block_device *bdev;
+
+ if (p->bdev)
+ bdev = p->bdev;
+ else
+ bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+ return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
+ int i;
+
if (prio >= 0)
p->prio = prio;
else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* low-to-high, while swap ordering is high-to-low
*/
p->list.prio = -p->prio;
- p->avail_list.prio = -p->prio;
+ for_each_node(i) {
+ if (p->prio >= 0)
+ p->avail_lists[i].prio = -p->prio;
+ else {
+ if (swap_node(p) == i)
+ p->avail_lists[i].prio = 1;
+ else
+ p->avail_lists[i].prio = -p->prio;
+ }
+ }
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* swap_info_struct.
*/
plist_add(&p->list, &swap_active_head);
- spin_lock(&swap_avail_lock);
- plist_add(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ add_to_avail_list(p);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- spin_lock(&swap_avail_lock);
- plist_del(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(p);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
+ int nid;
plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
si->list.prio--;
- si->avail_list.prio--;
+ for_each_node(nid) {
+ if (si->avail_lists[nid].prio != 1)
+ si->avail_lists[nid].prio--;
+ }
}
least_priority++;
}
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
+ if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ atomic_dec(&nr_rotate_swap);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
unsigned int type;
+ int i;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
plist_node_init(&p->list, 0);
- plist_node_init(&p->avail_list, 0);
+ for_each_node(i)
+ plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!swap_avail_heads)
+ return -ENOMEM;
+
p = alloc_swap_info();
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
- }
+ } else
+ atomic_inc(&nr_rotate_swap);
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
@@ -3052,7 +3290,8 @@ bad_swap:
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
- vfree(cluster_info);
+ kvfree(cluster_info);
+ kvfree(frontswap_map);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
inode_unlock(inode);
@@ -3457,3 +3696,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
}
+
+static int __init swapfile_init(void)
+{
+ int nid;
+
+ swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+ GFP_KERNEL);
+ if (!swap_avail_heads) {
+ pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+ return -ENOMEM;
+ }
+
+ for_each_node(nid)
+ plist_head_init(&swap_avail_heads[nid]);
+
+ return 0;
+}
+subsys_initcall(swapfile_init);