summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 23:10:54 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 23:10:54 -0800
commit2e3078af2c67730c479f1d183af5b367f5d95337 (patch)
treeb7881c6c9c479aadac345df7e18e3c0e10f0811e /mm/hugetlb.c
parentea5c58e70c3a148ada0d3061a8f529589bb766ba (diff)
parentb3b0d09c7a2330759ac293f5269bd932439ea0ff (diff)
downloadlinux-2e3078af2c67730c479f1d183af5b367f5d95337.tar.bz2
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton: - inotify tweaks - some ocfs2 updates (many more are awaiting review) - various misc bits - kernel/watchdog.c updates - Some of mm. I have a huge number of MM patches this time and quite a lot of it is quite difficult and much will be held over to next time. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (162 commits) selftests: vm: add tests for lock on fault mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage mm: introduce VM_LOCKONFAULT mm: mlock: add new mlock system call mm: mlock: refactor mlock, munlock, and munlockall code kasan: always taint kernel on report mm, slub, kasan: enable user tracking by default with KASAN=y kasan: use IS_ALIGNED in memory_is_poisoned_8() kasan: Fix a type conversion error lib: test_kasan: add some testcases kasan: update reference to kasan prototype repo kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile kasan: various fixes in documentation kasan: update log messages kasan: accurately determine the type of the bad access kasan: update reported bug types for kernel memory accesses kasan: update reported bug types for not user nor kernel memory accesses mm/kasan: prevent deadlock in kasan reporting mm/kasan: don't use kasan shadow pointer in generic functions mm/kasan: MODULE_VADDR is not available on all archs ...
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c139
1 files changed, 124 insertions, 15 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9cc773483624..74ef0c6a25dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1437,7 +1437,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
dissolve_free_huge_page(pfn_to_page(pfn));
}
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
+ * page from any node, and let the buddy allocator itself figure
+ * it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
+ * strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+ int order = huge_page_order(h);
+ gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * We need a VMA to get a memory policy. If we do not
+ * have one, we use the 'nid' argument.
+ *
+ * The mempolicy stuff below has some non-inlined bits
+ * and calls ->vm_ops. That makes it hard to optimize at
+ * compile-time, even when NUMA is off and it does
+ * nothing. This helps the compiler optimize it out.
+ */
+ if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+ /*
+ * If a specific node is requested, make sure to
+ * get memory from there, but only when a node
+ * is explicitly specified.
+ */
+ if (nid != NUMA_NO_NODE)
+ gfp |= __GFP_THISNODE;
+ /*
+ * Make sure to call something that can handle
+ * nid=NUMA_NO_NODE
+ */
+ return alloc_pages_node(nid, gfp, order);
+ }
+
+ /*
+ * OK, so we have a VMA. Fetch the mempolicy and try to
+ * allocate a huge page with it. We will only reach this
+ * when CONFIG_NUMA=y.
+ */
+ do {
+ struct page *page;
+ struct mempolicy *mpol;
+ struct zonelist *zl;
+ nodemask_t *nodemask;
+
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+ mpol_cond_put(mpol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+ if (page)
+ return page;
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
+ * this case which signifies that the allocation should be done with
+ * respect for the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken in to account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
{
struct page *page;
unsigned int r_nid;
@@ -1446,6 +1521,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
return NULL;
/*
+ * Make sure that anyone specifying 'nid' is not also specifying a VMA.
+ * This makes sure the caller is picking _one_ of the modes with which
+ * we can call this function, not both.
+ */
+ if (vma || (addr != -1)) {
+ VM_WARN_ON_ONCE(addr == -1);
+ VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+ }
+ /*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
* overcommit
@@ -1478,14 +1562,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
}
spin_unlock(&hugetlb_lock);
- if (nid == NUMA_NO_NODE)
- page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
- else
- page = __alloc_pages_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+ page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1510,6 +1587,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
}
/*
+ * Allocate a huge page from 'nid'. Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+ unsigned long addr = -1;
+
+ return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
+/*
* This allocation function is useful in the context where vma is irrelevant.
* E.g. soft-offlining uses this function because it only cares physical
* address of error page.
@@ -1524,7 +1624,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (!page)
- page = alloc_buddy_huge_page(h, nid);
+ page = __alloc_buddy_huge_page_no_mpol(h, nid);
return page;
}
@@ -1554,7 +1654,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
if (!page) {
alloc_ok = false;
break;
@@ -1787,7 +1887,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
@@ -2376,7 +2476,7 @@ struct node_hstate {
struct kobject *hugepages_kobj;
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
/*
* A subset of global hstate attributes for node devices
@@ -2790,6 +2890,12 @@ void hugetlb_show_meminfo(void)
1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+ atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
@@ -3025,6 +3131,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
get_page(ptepage);
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
+ hugetlb_count_add(pages_per_huge_page(h), dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -3105,6 +3212,7 @@ again:
if (huge_pte_dirty(pte))
set_page_dirty(page);
+ hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
@@ -3509,6 +3617,7 @@ retry:
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
+ hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -4028,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+ unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
/*
* match the virtual addresses, permission and the alignment of the