2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_MEMPOLICY_H
|
|
|
|
#define _LINUX_MEMPOLICY_H 1
|
|
|
|
|
|
|
|
#include <linux/errno.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NUMA memory policies for Linux.
|
|
|
|
* Copyright 2003,2004 Andi Kleen SuSE Labs
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Policies */
|
|
|
|
#define MPOL_DEFAULT 0
|
|
|
|
#define MPOL_PREFERRED 1
|
|
|
|
#define MPOL_BIND 2
|
|
|
|
#define MPOL_INTERLEAVE 3
|
|
|
|
|
|
|
|
#define MPOL_MAX MPOL_INTERLEAVE
|
|
|
|
|
|
|
|
/* Flags for get_mem_policy */
|
|
|
|
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
|
|
|
|
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
|
|
|
|
|
|
|
|
/* Flags for mbind */
|
|
|
|
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
|
2006-01-08 17:00:50 +08:00
|
|
|
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
|
|
|
|
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
|
|
|
|
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/spinlock.h>
|
2005-10-30 09:15:48 +08:00
|
|
|
#include <linux/nodemask.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct vm_area_struct;
|
2006-06-08 15:43:41 +08:00
|
|
|
struct mm_struct;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Describe a memory policy.
|
|
|
|
*
|
|
|
|
* A mempolicy can be either associated with a process or with a VMA.
|
|
|
|
* For VMA related allocations the VMA policy is preferred, otherwise
|
|
|
|
* the process policy is used. Interrupts ignore the memory policy
|
|
|
|
* of the current process.
|
|
|
|
*
|
|
|
|
* Locking policy for interlave:
|
|
|
|
* In process context there is no locking because only the process accesses
|
|
|
|
* its own state. All vma manipulation is somewhat protected by a down_read on
|
2005-10-30 09:16:41 +08:00
|
|
|
* mmap_sem.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Freeing policy:
|
|
|
|
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
|
|
|
|
* All other policies don't have any external state. mpol_free() handles this.
|
|
|
|
*
|
|
|
|
* Copying policy objects:
|
|
|
|
* For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this.
|
|
|
|
*/
|
|
|
|
struct mempolicy {
|
|
|
|
atomic_t refcnt;
|
|
|
|
short policy; /* See MPOL_* above */
|
|
|
|
union {
|
|
|
|
struct zonelist *zonelist; /* bind */
|
|
|
|
short preferred_node; /* preferred */
|
2005-10-30 09:15:48 +08:00
|
|
|
nodemask_t nodes; /* interleave */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* undefined for default */
|
|
|
|
} v;
|
[PATCH] cpuset: numa_policy_rebind cleanup
Cleanup, reorganize and make more robust the mempolicy.c code to rebind
mempolicies relative to the containing cpuset after a tasks memory placement
changes.
The real motivator for this cleanup patch is to lay more groundwork for the
upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's
after the containing cpuset memory placement changes.
NUMA mempolicies are constrained by the cpuset their task is a member of.
When either (1) a task is moved to a different cpuset, or (2) the 'mems'
mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded
node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to
be recalculated, relative to their new cpuset placement.
The old code used an unreliable method of determining what was the old
mems_allowed constraining the mempolicy. It just looked at the tasks
mems_allowed value. This sort of worked with the present code, that just
rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken,
referring to the old nodes. But in an upcoming patch, the vma mempolicies
will be rebound as well. Then the order in which the various task and vma
mempolicies are updated will no longer be deterministic, and one can no longer
count on the task->mems_allowed holding the old value for as long as needed.
It's not even clear if the current code was guaranteed to work reliably for
task mempolicies.
So I added a mems_allowed field to each mempolicy, stating exactly what
mems_allowed the policy is relative to, and updated synchronously and reliably
anytime that the mempolicy is rebound.
Also removed a useless wrapper routine, numa_policy_rebind(), and had its
caller, cpuset_update_task_memory_state(), call directly to the rewritten
policy_rebind() routine, and made that rebind routine extern instead of
static, and added a "mpol_" prefix to its name, making it
mpol_rebind_policy().
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:56 +08:00
|
|
|
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Support for managing mempolicy data objects (clone, copy, destroy)
|
|
|
|
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
|
|
|
|
*/
|
|
|
|
|
|
|
|
extern void __mpol_free(struct mempolicy *pol);
|
|
|
|
static inline void mpol_free(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
if (pol)
|
|
|
|
__mpol_free(pol);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern struct mempolicy *__mpol_copy(struct mempolicy *pol);
|
|
|
|
static inline struct mempolicy *mpol_copy(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
if (pol)
|
|
|
|
pol = __mpol_copy(pol);
|
|
|
|
return pol;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define vma_policy(vma) ((vma)->vm_policy)
|
|
|
|
#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol))
|
|
|
|
|
|
|
|
static inline void mpol_get(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
if (pol)
|
|
|
|
atomic_inc(&pol->refcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b);
|
|
|
|
static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
|
|
|
{
|
|
|
|
if (a == b)
|
|
|
|
return 1;
|
|
|
|
return __mpol_equal(a, b);
|
|
|
|
}
|
|
|
|
#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b))
|
|
|
|
|
|
|
|
/* Could later add inheritance of the process policy here. */
|
|
|
|
|
|
|
|
#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tree of shared policies for a shared memory region.
|
|
|
|
* Maintain the policies in a pseudo mm that contains vmas. The vmas
|
|
|
|
* carry the policy. As a special twist the pseudo mm is indexed in pages, not
|
|
|
|
* bytes, so that we can work with shared memory segments bigger than
|
|
|
|
* unsigned long.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct sp_node {
|
|
|
|
struct rb_node nd;
|
|
|
|
unsigned long start, end;
|
|
|
|
struct mempolicy *policy;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct shared_policy {
|
|
|
|
struct rb_root root;
|
|
|
|
spinlock_t lock;
|
|
|
|
};
|
|
|
|
|
2006-01-15 05:20:48 +08:00
|
|
|
void mpol_shared_policy_init(struct shared_policy *info, int policy,
|
|
|
|
nodemask_t *nodes);
|
2005-04-17 06:20:36 +08:00
|
|
|
int mpol_set_shared_policy(struct shared_policy *info,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
struct mempolicy *new);
|
|
|
|
void mpol_free_shared_policy(struct shared_policy *p);
|
|
|
|
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
|
|
|
|
unsigned long idx);
|
|
|
|
|
|
|
|
extern void numa_default_policy(void);
|
|
|
|
extern void numa_policy_init(void);
|
[PATCH] cpuset: numa_policy_rebind cleanup
Cleanup, reorganize and make more robust the mempolicy.c code to rebind
mempolicies relative to the containing cpuset after a tasks memory placement
changes.
The real motivator for this cleanup patch is to lay more groundwork for the
upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's
after the containing cpuset memory placement changes.
NUMA mempolicies are constrained by the cpuset their task is a member of.
When either (1) a task is moved to a different cpuset, or (2) the 'mems'
mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded
node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to
be recalculated, relative to their new cpuset placement.
The old code used an unreliable method of determining what was the old
mems_allowed constraining the mempolicy. It just looked at the tasks
mems_allowed value. This sort of worked with the present code, that just
rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken,
referring to the old nodes. But in an upcoming patch, the vma mempolicies
will be rebound as well. Then the order in which the various task and vma
mempolicies are updated will no longer be deterministic, and one can no longer
count on the task->mems_allowed holding the old value for as long as needed.
It's not even clear if the current code was guaranteed to work reliably for
task mempolicies.
So I added a mems_allowed field to each mempolicy, stating exactly what
mems_allowed the policy is relative to, and updated synchronously and reliably
anytime that the mempolicy is rebound.
Also removed a useless wrapper routine, numa_policy_rebind(), and had its
caller, cpuset_update_task_memory_state(), call directly to the rewritten
policy_rebind() routine, and made that rebind routine extern instead of
static, and added a "mpol_" prefix to its name, making it
mpol_rebind_policy().
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:56 +08:00
|
|
|
extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
|
|
|
|
extern void mpol_rebind_task(struct task_struct *tsk,
|
|
|
|
const nodemask_t *new);
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
|
2006-03-24 19:16:08 +08:00
|
|
|
extern void mpol_fix_fork_child_flag(struct task_struct *p);
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
|
|
|
|
|
2006-10-22 01:24:17 +08:00
|
|
|
#ifdef CONFIG_CPUSETS
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
#define current_cpuset_is_being_rebound() \
|
|
|
|
(cpuset_being_rebound == current->cpuset)
|
|
|
|
#else
|
|
|
|
#define current_cpuset_is_being_rebound() 0
|
|
|
|
#endif
|
|
|
|
|
2005-07-07 01:56:03 +08:00
|
|
|
extern struct mempolicy default_policy;
|
2006-01-06 16:10:46 +08:00
|
|
|
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
|
Fix NUMA Memory Policy Reference Counting
This patch proposes fixes to the reference counting of memory policy in the
page allocation paths and in show_numa_map(). Extracted from my "Memory
Policy Cleanups and Enhancements" series as stand-alone.
Shared policy lookup [shmem] has always added a reference to the policy,
but this was never unrefed after page allocation or after formatting the
numa map data.
Default system policy should not require additional ref counting, nor
should the current task's task policy. However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.
This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy. Again, shared policy is already reference counted on lookup. A
matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy. We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.
Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy. In this case, we should hold the reference until after the
huge page allocation in dequeue_hugepage(). The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.
Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:
w/o patch w/ refcount patch
Avg Std Devn Avg Std Devn
Real: 100.59 0.38 100.63 0.43
User: 1209.60 0.37 1209.91 0.31
System: 81.52 0.42 81.64 0.34
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 13:46:47 +08:00
|
|
|
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
|
2006-01-19 09:42:36 +08:00
|
|
|
extern unsigned slab_node(struct mempolicy *policy);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 14:31:18 +08:00
|
|
|
extern enum zone_type policy_zone;
|
2006-01-06 16:11:17 +08:00
|
|
|
|
2006-09-26 14:31:18 +08:00
|
|
|
static inline void check_highest_zone(enum zone_type k)
|
2006-01-06 16:11:17 +08:00
|
|
|
{
|
2007-08-23 05:02:05 +08:00
|
|
|
if (k > policy_zone && k != ZONE_MOVABLE)
|
2006-01-06 16:11:17 +08:00
|
|
|
policy_zone = k;
|
|
|
|
}
|
|
|
|
|
2006-01-08 17:00:51 +08:00
|
|
|
int do_migrate_pages(struct mm_struct *mm,
|
|
|
|
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
|
|
|
|
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
|
|
|
|
|
|
|
struct mempolicy {};
|
|
|
|
|
|
|
|
static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
#define vma_mpol_equal(a,b) 1
|
|
|
|
|
|
|
|
#define mpol_set_vma_default(vma) do {} while(0)
|
|
|
|
|
|
|
|
static inline void mpol_free(struct mempolicy *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mpol_get(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct mempolicy *mpol_copy(struct mempolicy *old)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct shared_policy {};
|
|
|
|
|
|
|
|
static inline int mpol_set_shared_policy(struct shared_policy *info,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
struct mempolicy *new)
|
|
|
|
{
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2006-01-15 05:20:48 +08:00
|
|
|
static inline void mpol_shared_policy_init(struct shared_policy *info,
|
|
|
|
int policy, nodemask_t *nodes)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mpol_free_shared_policy(struct shared_policy *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct mempolicy *
|
|
|
|
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define vma_policy(vma) NULL
|
|
|
|
#define vma_set_policy(vma, pol) do {} while(0)
|
|
|
|
|
|
|
|
static inline void numa_policy_init(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void numa_default_policy(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: numa_policy_rebind cleanup
Cleanup, reorganize and make more robust the mempolicy.c code to rebind
mempolicies relative to the containing cpuset after a tasks memory placement
changes.
The real motivator for this cleanup patch is to lay more groundwork for the
upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's
after the containing cpuset memory placement changes.
NUMA mempolicies are constrained by the cpuset their task is a member of.
When either (1) a task is moved to a different cpuset, or (2) the 'mems'
mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded
node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to
be recalculated, relative to their new cpuset placement.
The old code used an unreliable method of determining what was the old
mems_allowed constraining the mempolicy. It just looked at the tasks
mems_allowed value. This sort of worked with the present code, that just
rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken,
referring to the old nodes. But in an upcoming patch, the vma mempolicies
will be rebound as well. Then the order in which the various task and vma
mempolicies are updated will no longer be deterministic, and one can no longer
count on the task->mems_allowed holding the old value for as long as needed.
It's not even clear if the current code was guaranteed to work reliably for
task mempolicies.
So I added a mems_allowed field to each mempolicy, stating exactly what
mems_allowed the policy is relative to, and updated synchronously and reliably
anytime that the mempolicy is rebound.
Also removed a useless wrapper routine, numa_policy_rebind(), and had its
caller, cpuset_update_task_memory_state(), call directly to the rewritten
policy_rebind() routine, and made that rebind routine extern instead of
static, and added a "mpol_" prefix to its name, making it
mpol_rebind_policy().
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:56 +08:00
|
|
|
static inline void mpol_rebind_policy(struct mempolicy *pol,
|
|
|
|
const nodemask_t *new)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mpol_rebind_task(struct task_struct *tsk,
|
[PATCH] cpusets: automatic numa mempolicy rebinding
This patch automatically updates a tasks NUMA mempolicy when its cpuset
memory placement changes. It does so within the context of the task,
without any need to support low level external mempolicy manipulation.
If a system is not using cpusets, or if running on a system with just the
root (all-encompassing) cpuset, then this remap is a no-op. Only when a
task is moved between cpusets, or a cpusets memory placement is changed
does the following apply. Otherwise, the main routine below,
rebind_policy() is not even called.
When mixing cpusets, scheduler affinity, and NUMA mempolicies, the
essential role of cpusets is to place jobs (several related tasks) on a set
of CPUs and Memory Nodes, the essential role of sched_setaffinity is to
manage a jobs processor placement within its allowed cpuset, and the
essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs
memory placement within its allowed cpuset.
However, CPU affinity and NUMA memory placement are managed within the
kernel using absolute system wide numbering, not cpuset relative numbering.
This is ok until a job is migrated to a different cpuset, or what's the
same, a jobs cpuset is moved to different CPUs and Memory Nodes.
Then the CPU affinity and NUMA memory placement of the tasks in the job
need to be updated, to preserve their cpuset-relative position. This can
be done for CPU affinity using sched_setaffinity() from user code, as one
task can modify anothers CPU affinity. This cannot be done from an
external task for NUMA memory placement, as that can only be modified in
the context of the task using it.
However, it easy enough to remap a tasks NUMA mempolicy automatically when
a task is migrated, using the existing cpuset mechanism to trigger a
refresh of a tasks memory placement after its cpuset has changed. All that
is needed is the old and new nodemask, and notice to the task that it needs
to rebind its mempolicy. The tasks mems_allowed has the old mask, the
tasks cpuset has the new mask, and the existing
cpuset_update_current_mems_allowed() mechanism provides the notice. The
bitmap/cpumask/nodemask remap operators provide the cpuset relative
calculations.
This patch leaves open a couple of issues:
1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies:
These mempolicies may reference nodes outside of those allowed to
the current task by its cpuset. Tasks are migrated as part of jobs,
which reside on what might be several cpusets in a subtree. When such
a job is migrated, all NUMA memory policy references to nodes within
that cpuset subtree should be translated, and references to any nodes
outside that subtree should be left untouched. A future patch will
provide the cpuset mechanism needed to mark such subtrees. With that
patch, we will be able to correctly migrate these other memory policies
across a job migration.
2) Updating cpuset, affinity and memory policies in user space:
This is harder. Any placement state stored in user space using
system-wide numbering will be invalidated across a migration. More
work will be required to provide user code with a migration-safe means
to manage its cpuset relative placement, while preserving the current
API's that pass system wide numbers, not cpuset relative numbers across
the kernel-user boundary.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 07:02:36 +08:00
|
|
|
const nodemask_t *new)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2006-03-24 19:16:08 +08:00
|
|
|
static inline void mpol_fix_fork_child_flag(struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:59 +08:00
|
|
|
#define set_cpuset_being_rebound(x) do {} while (0)
|
|
|
|
|
2006-01-06 16:10:46 +08:00
|
|
|
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
|
Fix NUMA Memory Policy Reference Counting
This patch proposes fixes to the reference counting of memory policy in the
page allocation paths and in show_numa_map(). Extracted from my "Memory
Policy Cleanups and Enhancements" series as stand-alone.
Shared policy lookup [shmem] has always added a reference to the policy,
but this was never unrefed after page allocation or after formatting the
numa map data.
Default system policy should not require additional ref counting, nor
should the current task's task policy. However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.
This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy. Again, shared policy is already reference counted on lookup. A
matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy. We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.
Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy. In this case, we should hold the reference until after the
huge page allocation in dequeue_hugepage(). The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.
Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:
w/o patch w/ refcount patch
Avg Std Devn Avg Std Devn
Real: 100.59 0.38 100.63 0.43
User: 1209.60 0.37 1209.91 0.31
System: 81.52 0.42 81.64 0.34
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 13:46:47 +08:00
|
|
|
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
|
2006-01-06 16:10:46 +08:00
|
|
|
{
|
2007-07-17 19:03:13 +08:00
|
|
|
return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
|
2006-01-06 16:10:46 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] cpusets: swap migration interface
Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.
It defaults to false (file contains "0"). It can be set true by writing
"1" to the file.
If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.
Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:00:56 +08:00
|
|
|
static inline int do_migrate_pages(struct mm_struct *mm,
|
|
|
|
const nodemask_t *from_nodes,
|
|
|
|
const nodemask_t *to_nodes, int flags)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-01-06 16:11:17 +08:00
|
|
|
static inline void check_highest_zone(int k)
|
|
|
|
{
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
|
|
|
|
#endif
|