summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@engr.sgi.com>2005-10-29 18:16:59 -0700
committerLinus Torvalds <torvalds@g5.osdl.org>2005-10-29 21:40:45 -0700
commit8bccd85ffbaf8ff1448d1235fa6594e207695531 (patch)
treed5ed1f3b2ba1d301c74cc0a62ed416e634c5bebb
parentbb7e7e032d2cb8e0e9a88a2be209de5e61033b39 (diff)
downloadlinux-8bccd85ffbaf8ff1448d1235fa6594e207695531.tar.bz2
[PATCH] Implement sys_* do_* layering in the memory policy layer.
- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions take
  variable sized bitmaps from user space as arguments. do_xxx functions take
  fixed sized nodemask_t as arguments and may be used from inside the kernel.
  Doing so simplifies the initialization code. There is no fs = kernel_ds
  assumption anymore.
- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy which restricts the nodes to those accessible to the
  task and updates cpusets.
- Add comments explaining limitations of bind policy

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--mm/mempolicy.c276
1 files changed, 162 insertions, 114 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 902d4c9eccdc..123925f50f86 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy a process counter
* is used.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
+ * FIXME: memory is allocated starting with the first node
+ * to the last. It would be better if bind would truly restrict
+ * the allocation to memory nodes instead
+ *
* preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
-
-/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
- unsigned long maxnode, int mode)
-{
- unsigned long k;
- unsigned long nlongs;
- unsigned long endmask;
-
- --maxnode;
- nodes_clear(*nodes);
- if (maxnode == 0 || !nmask)
- return 0;
-
- nlongs = BITS_TO_LONGS(maxnode);
- if ((maxnode % BITS_PER_LONG) == 0)
- endmask = ~0UL;
- else
- endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
- if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
- for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
- if (get_user(t, nmask + k))
- return -EFAULT;
- if (k == nlongs - 1) {
- if (t & endmask)
- return -EINVAL;
- } else if (t)
- return -EINVAL;
- }
- nlongs = BITS_TO_LONGS(MAX_NUMNODES);
- endmask = ~0UL;
- }
-
- if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
- return -EFAULT;
- nodes_addr(*nodes)[nlongs-1] &= endmask;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- /* AK: shouldn't this error out instead? */
- cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
- return mpol_check_policy(mode, nodes);
-}
-
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err;
}
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+/*
+ * Fit a requested node set to the calling task before a policy is built
+ * from it.
+ *
+ * A NULL node set needs no adjusting and is accepted as-is.  Otherwise the
+ * task's mems_allowed is refreshed, nodes outside it are dropped from
+ * *nodes, and the result of mpol_check_policy() for the trimmed set is
+ * returned.
+ */
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+	if (nodes) {
+		/* Bring current->mems_allowed up to date first */
+		cpuset_update_current_mems_allowed();
+		/* Nodes not set in current->mems_allowed are silently ignored */
+		cpuset_restrict_to_mems_allowed(nodes->bits);
+		return mpol_check_policy(mode, nodes);
+	}
+
+	return 0;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
- nodemask_t nodes;
int err;
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
return -EINVAL;
if (end == start)
return 0;
-
- err = get_nodes(&nodes, nmask, maxnode, mode);
- if (err)
- return err;
-
- new = mpol_new(mode, &nodes);
+ if (contextualize_policy(mode, nmask))
+ return -EINVAL;
+ new = mpol_new(mode, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, &nodes, flags);
+ vma = check_range(mm, start, end, nmask, flags);
err = PTR_ERR(vma);
if (!IS_ERR(vma))
err = mbind_range(vma, start, end, new);
@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
{
- int err;
struct mempolicy *new;
- nodemask_t nodes;
- if (mode < 0 || mode > MPOL_MAX)
+ if (contextualize_policy(mode, nodes))
return -EINVAL;
- err = get_nodes(&nodes, nmask, maxnode, mode);
- if (err)
- return err;
- new = mpol_new(mode, &nodes);
+ new = mpol_new(mode, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
- node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
+ node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+ *nodes);
break;
case MPOL_DEFAULT:
break;
@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
return err;
}
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
- nodemask_t *nodes)
-{
- unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
-
- if (copy > nbytes) {
- if (copy > PAGE_SIZE)
- return -EINVAL;
- if (clear_user((char __user *)mask + nbytes, copy - nbytes))
- return -EFAULT;
- copy = nbytes;
- }
- return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
-}
-
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
{
- int err, pval;
+ int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
- return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
err = lookup_node(mm, addr);
if (err < 0)
goto out;
- pval = err;
+ *policy = err;
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
- pval = current->il_next;
+ *policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
} else
- pval = pol->policy;
+ *policy = pol->policy;
if (vma) {
up_read(&current->mm->mmap_sem);
vma = NULL;
}
- if (policy && put_user(pval, policy))
- return -EFAULT;
-
err = 0;
- if (nmask) {
- nodemask_t nodes;
- get_zonemask(pol, &nodes);
- err = copy_nodes_to_user(nmask, maxnode, &nodes);
- }
+ if (nmask)
+ get_zonemask(pol, nmask);
out:
if (vma)
@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
return err;
}
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/*
+ * Copy a node mask from user space.
+ *
+ * The user bitmap at nmask is treated as holding maxnode - 1 bits.  *nodes
+ * is always cleared first; a NULL nmask, or a bitmap of zero bits, leaves
+ * it empty and succeeds.
+ *
+ * Returns 0 on success, -EFAULT if the user bitmap cannot be read, and
+ * -EINVAL if the bitmap is larger than one page or has a bit set in the
+ * words beyond BITS_TO_LONGS(MAX_NUMNODES).
+ */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	nodes_clear(*nodes);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	/*
+	 * Bound maxnode before anything is sized from it.  A caller passing
+	 * maxnode == 0 arrives here with ULONG_MAX after the decrement above;
+	 * the round-up in BITS_TO_LONGS() would then wrap to 0 and the
+	 * endmask store at the end would index nodes_addr(*nodes)[-1].
+	 * One page worth of bitmap is the most that is accepted, which also
+	 * makes a separate nlongs limit check unnecessary below.
+	 */
+	if (maxnode > (PAGE_SIZE / sizeof(long)) * BITS_PER_LONG)
+		return -EINVAL;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+
+			if (get_user(t, nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				/* Only the bits below maxnode count in the last word */
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		/* The supported part is copied whole, so nothing to trim */
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	/* Drop bits at or above maxnode from the last copied word */
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	return 0;
+}
+
+/*
+ * Copy a kernel node mask to user space.
+ *
+ * The number of bytes written is maxnode - 1 bits rounded up to a multiple
+ * of 64 bits.  When that is more than the kernel mask occupies, the excess
+ * at the end of the user buffer is cleared and only the kernel mask's bytes
+ * are copied.
+ *
+ * Returns 0 on success, -EINVAL if more than PAGE_SIZE bytes would be
+ * written, -EFAULT if the user buffer cannot be written.
+ */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      nodemask_t *nodes)
+{
+	const int kernel_bytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+	unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
+
+	if (user_bytes > kernel_bytes) {
+		if (user_bytes > PAGE_SIZE)
+			return -EINVAL;
+		/* Clear the part of the user buffer past the kernel mask */
+		if (clear_user((char __user *)mask + kernel_bytes,
+			       user_bytes - kernel_bytes))
+			return -EFAULT;
+		user_bytes = kernel_bytes;
+	}
+
+	if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Change policy for a memory range: read the variable sized user bitmap
+ * into a nodemask_t with get_nodes() and pass everything on to do_mbind().
+ * A get_nodes() failure is returned unchanged.
+ */
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			unsigned long mode,
+			unsigned long __user *nmask, unsigned long maxnode,
+			unsigned flags)
+{
+	nodemask_t nodes;
+	int err = get_nodes(&nodes, nmask, maxnode);
+
+	if (err != 0)
+		return err;
+
+	return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/*
+ * Set the process memory policy.
+ *
+ * Rejects a mode outside 0..MPOL_MAX with -EINVAL, reads the user bitmap
+ * into a nodemask_t with get_nodes() and passes it to do_set_mempolicy().
+ * A get_nodes() failure is returned unchanged.
+ */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+		unsigned long maxnode)
+{
+	nodemask_t nodes;
+	int err;
+
+	if (!(mode >= 0 && mode <= MPOL_MAX))
+		return -EINVAL;
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err == 0)
+		return do_set_mempolicy(mode, &nodes);
+
+	return err;
+}
+
+/*
+ * Retrieve NUMA policy.
+ *
+ * Calls do_get_mempolicy() with kernel-side buffers and then copies the
+ * results out: the policy value to *policy when policy is non-NULL, and the
+ * node mask to nmask when nmask is non-NULL.
+ *
+ * Returns -EINVAL if nmask is given but maxnode is below MAX_NUMNODES, the
+ * error from do_get_mempolicy() if it fails, -EFAULT if the policy value
+ * cannot be stored, otherwise the result of copy_nodes_to_user() (or 0 when
+ * no mask was requested).
+ */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+				unsigned long __user *nmask,
+				unsigned long maxnode,
+				unsigned long addr, unsigned long flags)
+{
+	nodemask_t nodes;
+	int pval;
+	int err;
+
+	if (nmask && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+
+	err = do_get_mempolicy(&pval, &nodes, addr, flags);
+	if (err)
+		return err;
+
+	if (policy) {
+		if (put_user(pval, policy))
+			return -EFAULT;
+	}
+
+	/* err is 0 from here on; without a mask to fill there is nothing left */
+	if (!nmask)
+		return 0;
+
+	return copy_nodes_to_user(nmask, maxnode, &nodes);
+}
+
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
+ pol = vma->vm_ops->get_policy(vma, addr);
else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
/* Set interleaving policy for system init. This way not all
the data structures allocated at system boot end up in node zero. */
- if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
- MAX_NUMNODES) < 0)
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
printk("numa_policy_init: interleaving failed\n");
}
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
void numa_default_policy(void)
{
- sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+ do_set_mempolicy(MPOL_DEFAULT, NULL);
}