summaryrefslogtreecommitdiffstats
path: root/mm/mempolicy.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--mm/mempolicy.c177
1 files changed, 146 insertions, 31 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e32360e90274..5e90b3fb7794 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
@@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
return 0;
}
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
- if (nodes_empty(*nodes))
- return -EINVAL;
- pol->nodes = *nodes;
- return 0;
-}
-
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
@@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
- .create = mpol_new_interleave,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
@@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
- .create = mpol_new_bind,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_preferred,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
@@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1446,7 +1446,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)(*mode) >= MPOL_MAX)
+
+ if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
@@ -1875,16 +1876,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
+ int mode = policy->mode;
+
/* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ if (unlikely(mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
+
+ if (mode == MPOL_PREFERRED_MANY)
return &policy->nodes;
return NULL;
}
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED) {
@@ -1922,7 +1934,7 @@ unsigned int mempolicy_slab_node(void)
struct mempolicy *policy;
int node = numa_mem_id();
- if (in_interrupt())
+ if (!in_task())
return node;
policy = current->mempolicy;
@@ -1936,7 +1948,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND: {
+ case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ {
struct zoneref *z;
/*
@@ -2008,12 +2022,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
@@ -2021,16 +2035,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
+ int mode;
*mpol = get_vma_policy(vma, addr);
- *nodemask = NULL; /* assume !MPOL_BIND */
+ *nodemask = NULL;
+ mode = (*mpol)->mode;
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
+ if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
@@ -2063,6 +2079,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
@@ -2128,6 +2145,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
return page;
}
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+ int nid, struct mempolicy *pol)
+{
+ struct page *page;
+ gfp_t preferred_gfp;
+
+ /*
+ * This is a two pass approach. The first pass will only try the
+ * preferred nodes but skip the direct reclaim and allow the
+ * allocation to fail, while the second pass will try all the
+ * nodes in system.
+ */
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+ if (!page)
+ page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+
+ return page;
+}
+
/**
* alloc_pages_vma - Allocate a page for a VMA.
* @gfp: GFP flags.
@@ -2163,6 +2201,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
+ if (pol->mode == MPOL_PREFERRED_MANY) {
+ page = alloc_pages_preferred_many(gfp, order, node, pol);
+ mpol_cond_put(pol);
+ goto out;
+ }
+
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
@@ -2173,7 +2217,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
- * If the policy is interleave, or does not allow the current
+ * If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED)
@@ -2240,6 +2284,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else if (pol->mode == MPOL_PREFERRED_MANY)
+ page = alloc_pages_preferred_many(gfp, order,
+ numa_node_id(), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
@@ -2311,6 +2358,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2425,8 +2473,8 @@ static void sp_free(struct sp_node *n)
* node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
- * Return: -1 if the page is in a node that is valid for this policy, or a
- * suitable node ID to allocate a replacement page from.
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
@@ -2437,7 +2485,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
- int ret = -1;
+ int ret = NUMA_NO_NODE;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
@@ -2451,6 +2499,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
+ if (node_isset(curnid, pol->nodes))
+ goto out;
polnid = first_node(pol->nodes);
break;
@@ -2465,9 +2515,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
goto out;
}
+ fallthrough;
+ case MPOL_PREFERRED_MANY:
/*
- * allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
@@ -2829,6 +2880,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2907,6 +2959,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -2993,6 +3046,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;
@@ -3021,3 +3075,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ numa_demotion_enabled = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ numa_demotion_enabled = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif