[PATCH] Implement sys_* do_* layering in the memory policy layer.

- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions
  take variable sized bitmaps from user space as arguments. do_xxx functions
  take fixed sized nodemask_t as arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code. There is no
  fs = kernel_ds assumption anymore.

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy which restricts the nodes to those accessible
  to the task and updates cpusets.

- Add comments explaining limitations of bind policy

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Christoph Lameter 2005-10-29 18:16:59 -07:00 committed by Linus Torvalds
parent bb7e7e032d
commit 8bccd85ffb
1 changed files with 162 additions and 114 deletions

View File

@ -2,6 +2,7 @@
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
@ -17,13 +18,19 @@
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter
* is used.
*
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
*
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
unsigned long maxnode, int mode)
{
unsigned long k;
unsigned long nlongs;
unsigned long endmask;
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/* When the user specified more nodes than supported just check
if the non supported part is all zero. */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
if (nlongs > PAGE_SIZE/sizeof(long))
return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
/* Update current mems_allowed */
cpuset_update_current_mems_allowed();
/* Ignore nodes not set in current->mems_allowed */
/* AK: shouldn't this error out instead? */
cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
return mpol_check_policy(mode, nodes);
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err;
}
/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
unsigned long mode,
unsigned long __user *nmask, unsigned long maxnode,
unsigned flags)
static int contextualize_policy(int mode, nodemask_t *nodes)
{
if (!nodes)
return 0;
/* Update current mems_allowed */
cpuset_update_current_mems_allowed();
/* Ignore nodes not set in current->mems_allowed */
cpuset_restrict_to_mems_allowed(nodes->bits);
return mpol_check_policy(mode, nodes);
}
long do_mbind(unsigned long start, unsigned long len,
unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
nodemask_t nodes;
int err;
if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
return -EINVAL;
if (end == start)
return 0;
err = get_nodes(&nodes, nmask, maxnode, mode);
if (err)
return err;
new = mpol_new(mode, &nodes);
if (contextualize_policy(mode, nmask))
return -EINVAL;
new = mpol_new(mode, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
mode,nodes_addr(nodes)[0]);
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, &nodes, flags);
vma = check_range(mm, start, end, nmask, flags);
err = PTR_ERR(vma);
if (!IS_ERR(vma))
err = mbind_range(vma, start, end, new);
@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
unsigned long maxnode)
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
int err;
struct mempolicy *new;
nodemask_t nodes;
if (mode < 0 || mode > MPOL_MAX)
if (contextualize_policy(mode, nodes))
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode, mode);
if (err)
return err;
new = mpol_new(mode, &nodes);
new = mpol_new(mode, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
*nodes);
break;
case MPOL_DEFAULT:
break;
@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
return err;
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr, unsigned long flags)
long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err, pval;
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
if (nmask != NULL && maxnode < MAX_NUMNODES)
return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
err = lookup_node(mm, addr);
if (err < 0)
goto out;
pval = err;
*policy = err;
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
pval = current->il_next;
*policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
} else
pval = pol->policy;
*policy = pol->policy;
if (vma) {
up_read(&current->mm->mmap_sem);
vma = NULL;
}
if (policy && put_user(pval, policy))
return -EFAULT;
err = 0;
if (nmask) {
nodemask_t nodes;
get_zonemask(pol, &nodes);
err = copy_nodes_to_user(nmask, maxnode, &nodes);
}
if (nmask)
get_zonemask(pol, nmask);
out:
if (vma)
@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
unsigned long nlongs;
unsigned long endmask;
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/* When the user specified more nodes than supported just check
if the non supported part is all zero. */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
if (nlongs > PAGE_SIZE/sizeof(long))
return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
return 0;
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
unsigned long mode,
unsigned long __user *nmask, unsigned long maxnode,
unsigned flags)
{
nodemask_t nodes;
int err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_mbind(start, len, mode, &nodes, flags);
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
unsigned long maxnode)
{
int err;
nodemask_t nodes;
if (mode < 0 || mode > MPOL_MAX)
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_set_mempolicy(mode, &nodes);
}
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr, unsigned long flags)
{
int err, pval;
nodemask_t nodes;
if (nmask != NULL && maxnode < MAX_NUMNODES)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
return err;
if (policy && put_user(pval, policy))
return -EFAULT;
if (nmask)
err = copy_nodes_to_user(nmask, maxnode, &nodes);
return err;
}
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
pol = vma->vm_ops->get_policy(vma, addr);
else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
/* Set interleaving policy for system init. This way not all
the data structures allocated at system boot end up in node zero. */
if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
MAX_NUMNODES) < 0)
if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default.
* Assumes fs == KERNEL_DS */
/* Reset policy of current process to default */
void numa_default_policy(void)
{
sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
do_set_mempolicy(MPOL_DEFAULT, NULL);
}