summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c141
-rw-r--r--mm/memory-failure.c21
-rw-r--r--mm/memory.c25
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/nommu.c9
-rw-r--r--mm/rmap.c111
-rw-r--r--mm/shmem.c74
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c29
-rw-r--r--mm/vmscan.c89
10 files changed, 331 insertions, 174 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf7d027a8844..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
- unsigned long next_scan_node_update;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
#endif
/*
* Should the accounting and control be hierarchical, per subtree?
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
return val;
}
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
- long ret;
-
- ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
- ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
- return ret;
-}
-
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
+ case MEM_CGROUP_TARGET_NUMAINFO:
+ next = val + NUMAINFO_EVENTS_TARGET;
+ break;
default:
return;
}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
mem_cgroup_threshold(mem);
__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
if (unlikely(__memcg_event_check(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT))){
+ MEM_CGROUP_TARGET_SOFTLIMIT))) {
mem_cgroup_update_tree(mem, page);
__mem_cgroup_target_update(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT);
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ }
+#if MAX_NUMNODES > 1
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_NUMAINFO))) {
+ atomic_inc(&mem->numainfo_events);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_NUMAINFO);
}
+#endif
}
}
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
return MEM_CGROUP_ZSTAT(mz, lru);
}
-#ifdef CONFIG_NUMA
static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
int nid)
{
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
return ret;
}
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+ int nid)
+{
+ unsigned long ret;
+
+ ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+ mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+ return ret;
+}
+
+#if MAX_NUMNODES > 1
static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
{
u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
return total;
}
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
- int nid)
-{
- unsigned long ret;
-
- ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
- mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
- return ret;
-}
-
static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
{
u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
return ret;
}
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+ int nid, bool noswap)
+{
+ if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+ return true;
+ if (noswap || !total_swap_pages)
+ return false;
+ if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+ return true;
+ return false;
+
+}
#if MAX_NUMNODES > 1
/*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
int nid;
-
- if (time_after(mem->next_scan_node_update, jiffies))
+ /*
+ * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+ * pagein/pageout changes since the last update.
+ */
+ if (!atomic_read(&mem->numainfo_events))
+ return;
+ if (atomic_inc_return(&mem->numainfo_updating) > 1)
return;
- mem->next_scan_node_update = jiffies + 10*HZ;
/* make a nodemask where this memcg uses memory from */
mem->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
- if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
- continue;
-
- if (total_swap_pages &&
- (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
- mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
- continue;
- node_clear(nid, mem->scan_nodes);
+ if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+ node_clear(nid, mem->scan_nodes);
}
+
+ atomic_set(&mem->numainfo_events, 0);
+ atomic_set(&mem->numainfo_updating, 0);
}
/*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
return node;
}
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ int nid;
+
+ /*
+ * quick check...making use of scan_node.
+ * We can skip unused nodes.
+ */
+ if (!nodes_empty(mem->scan_nodes)) {
+ for (nid = first_node(mem->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, mem->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ if (node_isset(nid, mem->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
return 0;
}
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
#endif
/*
@@ -1701,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
}
}
}
- if (!mem_cgroup_local_usage(victim)) {
+ if (!mem_cgroup_reclaimable(victim, noswap)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eac0ba561491..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -391,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct anon_vma *av;
- read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
@@ -408,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma(av);
}
/*
@@ -424,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
- read_lock(&tasklist_lock);
mutex_lock(&mapping->i_mmap_mutex);
+ read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -454,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- mutex_unlock(&mapping->i_mmap_mutex);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 87d935333f0d..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return 0;
+ batch = tlb->active;
}
VM_BUG_ON(batch->nr > batch->max);
@@ -2798,30 +2799,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * If the underlying filesystem is not going to provide
- * a way to truncate a range of blocks (punch a hole) -
- * we should return failure right now.
- */
- if (!inode->i_op->truncate_range)
- return -ENOSYS;
-
- mutex_lock(&inode->i_mutex);
- down_write(&inode->i_alloc_sem);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- truncate_inode_pages_range(mapping, offset, end);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- inode->i_op->truncate_range(inode, offset, end);
- up_write(&inode->i_alloc_sem);
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 02159c755136..c46887b5a11e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -498,7 +498,9 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* The node we allocated has no zone fallback lists. For avoiding
* to access not-initialized zonelist, build here.
*/
+ mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
return pgdat;
}
@@ -521,7 +523,7 @@ int mem_online_node(int nid)
lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
- if (pgdat) {
+ if (!pgdat) {
ret = -ENOMEM;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..9edc897a3970 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1813,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return NULL;
}
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
- unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
- vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88dd..23295f65ae43 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -38,9 +38,8 @@
* in arch-dependent flush_dcache_mmap_lock,
* within inode_wb_list_lock in __sync_single_inode)
*
- * (code doesn't rely on that order so it could be switched around)
- * ->tasklist_lock
- * anon_vma->mutex (memory_failure, collect_procs_anon)
+ * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
* pte map lock
*/
@@ -112,9 +111,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
kmem_cache_free(anon_vma_cachep, anon_vma);
}
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
- return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
@@ -200,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ mutex_unlock(&root->mutex);
+ root = new_root;
+ mutex_lock(&root->mutex);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ mutex_unlock(&root->mutex);
+}
+
static void anon_vma_chain_link(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
@@ -208,13 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_lock(anon_vma);
/*
* It's critical to add new vmas to the tail of the anon_vma,
* see comment in huge_memory.c:__split_huge_page().
*/
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- anon_vma_unlock(anon_vma);
}
/*
@@ -224,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- avc = anon_vma_chain_alloc();
- if (!avc)
- goto enomem_failure;
- anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
}
+ unlock_anon_vma_root(root);
return 0;
enomem_failure:
@@ -263,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -280,7 +314,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_lock(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_unlock(anon_vma);
return 0;
@@ -291,36 +327,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
return -ENOMEM;
}
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
- struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
- int empty;
-
- /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
- if (!anon_vma)
- return;
-
- anon_vma_lock(anon_vma);
- list_del(&anon_vma_chain->same_anon_vma);
-
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
- anon_vma_unlock(anon_vma);
-
- if (empty)
- put_anon_vma(anon_vma);
-}
-
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- anon_vma_unlink(avc);
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ list_del(&avc->same_anon_vma);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (list_empty(&anon_vma->head))
+ continue;
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ unlock_anon_vma_root(root);
+
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to acquire the anon_vma->root->mutex.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ put_anon_vma(anon_vma);
+
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b1..fcedf5464eb7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
} while (next);
}
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long idx;
@@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
spinlock_t *punch_lock;
unsigned long upper_limit;
+ truncate_inode_pages_range(inode->i_mapping, start, end);
+
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (idx >= info->next_index)
@@ -738,16 +740,8 @@ done2:
* lowered next_index. Also, though shmem_getpage checks
* i_size before adding to cache, no recheck after: so fix the
* narrow window there too.
- *
- * Recalling truncate_inode_pages_range and unmap_mapping_range
- * every time for punch_hole (which never got a chance to clear
- * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
- * yet hardly ever necessary: try to optimize them out later.
*/
truncate_inode_pages_range(inode->i_mapping, start, end);
- if (punch_hole)
- unmap_mapping_range(inode->i_mapping, start,
- end - start, 1);
}
spin_lock(&info->lock);
@@ -766,22 +760,23 @@ done2:
shmem_free_pages(pages_to_free.next);
}
}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- loff_t newsize = attr->ia_size;
int error;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
- && newsize != inode->i_size) {
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
struct page *page = NULL;
- if (newsize < inode->i_size) {
+ if (newsize < oldsize) {
/*
* If truncating down to a partial page, then
* if that page is already allocated, hold it
@@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
spin_unlock(&info->lock);
}
}
-
- /* XXX(truncate): truncate_setsize should be called last */
- truncate_setsize(inode, newsize);
+ if (newsize != oldsize) {
+ i_size_write(inode, newsize);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ }
+ if (newsize < oldsize) {
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ }
if (page)
page_cache_release(page);
- shmem_truncate_range(inode, newsize, (loff_t)-1);
}
setattr_copy(inode, attr);
@@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode)
struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
- truncate_inode_pages(inode->i_mapping, 0);
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = {
};
static const struct inode_operations shmem_inode_operations = {
- .setattr = shmem_notify_change,
+ .setattr = shmem_setattr,
.truncate_range = shmem_truncate_range,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
@@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.removexattr = shmem_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
+ .setattr = shmem_setattr,
.check_acl = generic_check_acl,
#endif
};
@@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = {
.removexattr = shmem_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
+ .setattr = shmem_setattr,
.check_acl = generic_check_acl,
#endif
};
@@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
return 0;
}
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+ truncate_inode_pages_range(inode->i_mapping, start, end);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/**
* mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -3028,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * Provide a stub for those callers to start using now, then later
+ * flesh it out to call shmem_getpage() with additional gfp mask, when
+ * shmem_file_splice_read() is added and shmem_readpage() is removed.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ return read_cache_page_gfp(mapping, index, gfp);
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7b..ff8dc1a18cb4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
-#include <linux/shm.h>
+#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
diff --git a/mm/truncate.c b/mm/truncate.c
index 3a29a6180212..e13f22efaad7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
@@ -603,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset)
return 0;
}
EXPORT_SYMBOL(vmtruncate);
+
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * If the underlying filesystem is not going to provide
+ * a way to truncate a range of blocks (punch a hole) -
+ * we should return failure right now.
+ */
+ if (!inode->i_op->truncate_range)
+ return -ENOSYS;
+
+ mutex_lock(&inode->i_mutex);
+ down_write(&inode->i_alloc_sem);
+ unmap_mapping_range(mapping, offset, (end - offset), 1);
+ inode->i_op->truncate_range(inode, offset, end);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(mapping, offset, (end - offset), 1);
+ up_write(&inode->i_alloc_sem);
+ mutex_unlock(&inode->i_mutex);
+
+ return 0;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ff834e19c24..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1995,14 +1995,13 @@ restart:
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
- unsigned long total_scanned = 0;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2017,19 +2016,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* need some check for avoid more shrink_zone() */
}
- nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- sc->order, sc->gfp_mask,
- &nr_soft_scanned);
- sc->nr_reclaimed += nr_soft_reclaimed;
- total_scanned += nr_soft_scanned;
-
shrink_zone(priority, zone, sc);
}
-
- return total_scanned;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2094,7 +2097,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- total_scanned += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2307,7 +2310,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
- return balanced_pages > (present_pages >> 2);
+ /* A special case here: if zone has no page, we think it's balanced */
+ return balanced_pages >= (present_pages >> 2);
}
/* is kswapd sleeping prematurely? */
@@ -2323,7 +2327,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
return true;
/* Check the watermark levels */
- for (i = 0; i < pgdat->nr_zones; i++) {
+ for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
@@ -2341,7 +2345,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
}
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- classzone_idx, 0))
+ i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2448,7 +2452,6 @@ loop_again:
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
- *classzone_idx = i;
break;
}
}
@@ -2507,18 +2510,18 @@ loop_again:
KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone) + balance_gap,
- end_zone, 0))
+ end_zone, 0)) {
shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 &&
- !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
+ }
+
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2528,6 +2531,12 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
+
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
@@ -2706,8 +2715,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
static int kswapd(void *p)
{
- unsigned long order;
- int classzone_idx;
+ unsigned long order, new_order;
+ int classzone_idx, new_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2737,17 +2746,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = 0;
- classzone_idx = MAX_NR_ZONES - 1;
+ order = new_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
for ( ; ; ) {
- unsigned long new_order;
- int new_classzone_idx;
int ret;
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon
+ * so consider going to sleep on the basis we reclaimed at
+ */
+ if (classzone_idx >= new_classzone_idx && order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
+
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
@@ -2760,7 +2775,7 @@ static int kswapd(void *p)
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();