Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--   mm/mempolicy.c   177
1 file changed, 146 insertions, 31 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e32360e90274..5e90b3fb7794 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -31,6 +31,9 @@
  * but useful to set in a VMA when you have a non default
  * process policy.
  *
+ * preferred many Try a set of nodes first before normal fallback. This is
+ *                similar to preferred without the special case.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 	nodes_onto(*ret, tmp, *rel);
 }
 
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
@@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 	return 0;
 }
 
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
-	if (nodes_empty(*nodes))
-		return -EINVAL;
-	pol->nodes = *nodes;
-	return 0;
-}
-
 /*
  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  * any, for the new policy. mpol_new() has already validated the nodes
@@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.rebind = mpol_rebind_default,
 	},
 	[MPOL_INTERLEAVE] = {
-		.create = mpol_new_interleave,
+		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
 	[MPOL_PREFERRED] = {
@@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.rebind = mpol_rebind_preferred,
 	},
 	[MPOL_BIND] = {
-		.create = mpol_new_bind,
+		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
 	[MPOL_LOCAL] = {
 		.rebind = mpol_rebind_default,
 	},
+	[MPOL_PREFERRED_MANY] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_preferred,
+	},
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_MANY:
 		*nodes = p->nodes;
 		break;
 	case MPOL_LOCAL:
@@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
-				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
@@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!list_empty(&pagelist)) {
 		WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 		nr_failed = migrate_pages(&pagelist, new_page, NULL,
-			start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+			start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
 		if (nr_failed)
 			putback_movable_pages(&pagelist);
 	}
@@ -1446,7 +1446,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
 {
 	*flags = *mode & MPOL_MODE_FLAGS;
 	*mode &= ~MPOL_MODE_FLAGS;
-	if ((unsigned int)(*mode) >= MPOL_MAX)
+
+	if ((unsigned int)(*mode) >= MPOL_MAX)
 		return -EINVAL;
 	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
 		return -EINVAL;
@@ -1875,16 +1876,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
  */
 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
+	int mode = policy->mode;
+
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
-	if (unlikely(policy->mode == MPOL_BIND) &&
-			apply_policy_zone(policy, gfp_zone(gfp)) &&
-			cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+	if (unlikely(mode == MPOL_BIND) &&
+		apply_policy_zone(policy, gfp_zone(gfp)) &&
+		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+		return &policy->nodes;
+
+	if (mode == MPOL_PREFERRED_MANY)
 		return &policy->nodes;
 
 	return NULL;
 }
 
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED) {
@@ -1922,7 +1934,7 @@ unsigned int mempolicy_slab_node(void)
 	struct mempolicy *policy;
 	int node = numa_mem_id();
 
-	if (in_interrupt())
+	if (!in_task())
 		return node;
 
 	policy = current->mempolicy;
@@ -1936,7 +1948,9 @@ unsigned int mempolicy_slab_node(void)
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
 
-	case MPOL_BIND: {
+	case MPOL_BIND:
+	case MPOL_PREFERRED_MANY:
+	{
 		struct zoneref *z;
 
 		/*
@@ -2008,12 +2022,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @addr: address in @vma for shared policy lookup and interleave policy
  * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
@@ -2021,16 +2035,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask)
 {
 	int nid;
+	int mode;
 
 	*mpol = get_vma_policy(vma, addr);
-	*nodemask = NULL;	/* assume !MPOL_BIND */
+	*nodemask = NULL;
+	mode = (*mpol)->mode;
 
-	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+	if (unlikely(mode == MPOL_INTERLEAVE)) {
 		nid = interleave_nid(*mpol, vma, addr,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
-		if ((*mpol)->mode == MPOL_BIND)
+		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
 			*nodemask = &(*mpol)->nodes;
 	}
 	return nid;
@@ -2063,6 +2079,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	mempolicy = current->mempolicy;
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->nodes;
@@ -2128,6 +2145,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 	return page;
 }
 
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+						int nid, struct mempolicy *pol)
+{
+	struct page *page;
+	gfp_t preferred_gfp;
+
+	/*
+	 * This is a two pass approach. The first pass will only try the
+	 * preferred nodes but skip the direct reclaim and allow the
+	 * allocation to fail, while the second pass will try all the
+	 * nodes in system.
+	 */
+	preferred_gfp = gfp | __GFP_NOWARN;
+	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+	if (!page)
+		page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+
+	return page;
+}
+
 /**
  * alloc_pages_vma - Allocate a page for a VMA.
  * @gfp: GFP flags.
@@ -2163,6 +2201,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
+	if (pol->mode == MPOL_PREFERRED_MANY) {
+		page = alloc_pages_preferred_many(gfp, order, node, pol);
+		mpol_cond_put(pol);
+		goto out;
+	}
+
 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
 		int hpage_node = node;
 
@@ -2173,7 +2217,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		 * node and don't fall back to other nodes, as the cost of
 		 * remote accesses would likely offset THP benefits.
 		 *
-		 * If the policy is interleave, or does not allow the current
+		 * If the policy is interleave or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
 		if (pol->mode == MPOL_PREFERRED)
@@ -2240,6 +2284,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
 	 */
 	if (pol->mode == MPOL_INTERLEAVE)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+	else if (pol->mode == MPOL_PREFERRED_MANY)
+		page = alloc_pages_preferred_many(gfp, order,
+				numa_node_id(), pol);
 	else
 		page = __alloc_pages(gfp, order,
				policy_node(gfp, pol, numa_node_id()),
@@ -2311,6 +2358,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_MANY:
 		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
@@ -2425,8 +2473,8 @@ static void sp_free(struct sp_node *n)
 * node id. Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
- * Return: -1 if the page is in a node that is valid for this policy, or a
- * suitable node ID to allocate a replacement page from.
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement page from.
 */
 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 {
@@ -2437,7 +2485,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
 	int polnid = NUMA_NO_NODE;
-	int ret = -1;
+	int ret = NUMA_NO_NODE;
 
 	pol = get_vma_policy(vma, addr);
 	if (!(pol->flags & MPOL_F_MOF))
@@ -2451,6 +2499,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_PREFERRED:
+		if (node_isset(curnid, pol->nodes))
+			goto out;
 		polnid = first_node(pol->nodes);
 		break;
 
@@ -2465,9 +2515,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 			break;
 		goto out;
 	}
+		fallthrough;
+
+	case MPOL_PREFERRED_MANY:
 		/*
-		 * allows binding to multiple nodes.
 		 * use current page if in policy nodemask,
 		 * else select nearest allowed node, if any.
 		 * If no allowed nodes, use current [!misplaced].
@@ -2829,6 +2880,7 @@ static const char * const policy_modes[] =
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
 	[MPOL_LOCAL]      = "local",
+	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
 
 
@@ -2907,6 +2959,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		if (!nodelist)
 			err = 0;
 		goto out;
+	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 		/*
 		 * Insist on a nodelist
@@ -2993,6 +3046,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_LOCAL:
 		break;
 	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 		nodes = pol->nodes;
@@ -3021,3 +3075,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
 			       nodemask_pr_args(&nodes));
 }
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+					  struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%s\n",
+			  numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+		numa_demotion_enabled = true;
+	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+		numa_demotion_enabled = false;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+	       numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+	&numa_demotion_enabled_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+	.attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+	int err;
+	struct kobject *numa_kobj;
+
+	numa_kobj = kobject_create_and_add("numa", mm_kobj);
+	if (!numa_kobj) {
+		pr_err("failed to create numa kobject\n");
+		return -ENOMEM;
+	}
+	err = sysfs_create_group(numa_kobj, &numa_attr_group);
+	if (err) {
+		pr_err("failed to register numa group\n");
+		goto delete_obj;
+	}
+	return 0;
+
+delete_obj:
+	kobject_put(numa_kobj);
+	return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
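
For context, a minimal userspace sketch (not part of the patch) of how the new mode could be exercised once this change and the matching uapi header update are in place: it sets the calling task's policy to MPOL_PREFERRED_MANY over nodes 0-1 through the raw set_mempolicy() syscall, after which allocations try those nodes first and only then fall back system-wide, mirroring alloc_pages_preferred_many() above. The local #define of MPOL_PREFERRED_MANY as 5 is an assumption about the uapi enum ordering, there only in case the installed headers predate the feature.

/* Hypothetical example: prefer nodes 0-1, fall back to any node on pressure. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY 5	/* assumed value; taken from the uapi MPOL_* enum */
#endif

int main(void)
{
	/* Nodes 0 and 1 make up the preferred set. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	unsigned long maxnode = 8 * sizeof(nodemask);

	/* glibc has no wrapper for set_mempolicy(), so use the raw syscall. */
	if (syscall(SYS_set_mempolicy, MPOL_PREFERRED_MANY, &nodemask, maxnode) < 0) {
		fprintf(stderr, "set_mempolicy: %s\n", strerror(errno));
		return 1;
	}

	/* Subsequent allocations try nodes 0-1 first, then any allowed node. */
	return 0;
}

Separately, the sysfs block at the end of the diff exposes /sys/kernel/mm/numa/demotion_enabled, which accepts the strings "true"/"false" (or "1"/"0") to toggle numa_demotion_enabled at runtime.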