2008-07-26 10:44:36 +08:00
|
|
|
#include <linux/mm.h>
|
2006-01-08 17:01:43 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/string.h>
|
2014-04-08 06:37:26 +08:00
|
|
|
#include <linux/compiler.h>
|
2011-10-16 14:01:52 +08:00
|
|
|
#include <linux/export.h>
|
2006-03-24 19:18:42 +08:00
|
|
|
#include <linux/err.h>
|
2008-07-27 06:22:28 +08:00
|
|
|
#include <linux/sched.h>
|
2012-05-31 08:17:35 +08:00
|
|
|
#include <linux/security.h>
|
2013-02-23 08:34:35 +08:00
|
|
|
#include <linux/swap.h>
|
2013-02-23 08:34:37 +08:00
|
|
|
#include <linux/swapops.h>
|
2013-11-13 07:08:31 +08:00
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/hugetlb.h>
|
|
|
|
|
2006-03-24 19:18:42 +08:00
|
|
|
#include <asm/uaccess.h>
|
2006-01-08 17:01:43 +08:00
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapping vma in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this is not applied to
the list (i.e. the list could be constructed in a different order than
the tree, so we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list doubly linked, a couple of places in the code
needed to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 08:11:22 +08:00
|
|
|
#include "internal.h"
|
|
|
|
|
2009-04-10 21:36:00 +08:00
|
|
|
#define CREATE_TRACE_POINTS
|
2009-04-15 07:39:12 +08:00
|
|
|
#include <trace/events/kmem.h>
|
2009-04-10 21:36:00 +08:00
|
|
|
|
2006-01-08 17:01:43 +08:00
|
|
|
/**
|
|
|
|
* kstrdup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*/
|
|
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strlen(s) + 1;
|
2006-10-04 17:15:25 +08:00
|
|
|
buf = kmalloc_track_caller(len, gfp);
|
2006-01-08 17:01:43 +08:00
|
|
|
if (buf)
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrdup);
|
2006-03-24 19:18:42 +08:00
|
|
|
|
2007-07-18 09:37:02 +08:00
|
|
|
/**
|
|
|
|
* kstrndup - allocate space for and copy an existing string
|
|
|
|
* @s: the string to duplicate
|
|
|
|
* @max: read at most @max chars from @s
|
|
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
|
|
*/
|
|
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!s)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
len = strnlen(s, max);
|
|
|
|
buf = kmalloc_track_caller(len+1, gfp);
|
|
|
|
if (buf) {
|
|
|
|
memcpy(buf, s, len);
|
|
|
|
buf[len] = '\0';
|
|
|
|
}
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller than len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplication bugs
done exactly because of diverged lengths:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 14:27:20 +08:00
|
|
|
/**
|
|
|
|
* kmemdup - duplicate region of memory
|
|
|
|
*
|
|
|
|
* @src: memory region to duplicate
|
|
|
|
* @len: memory region length
|
|
|
|
* @gfp: GFP mask to use
|
|
|
|
*/
|
|
|
|
void *kmemdup(const void *src, size_t len, gfp_t gfp)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
2006-10-04 17:15:25 +08:00
|
|
|
p = kmalloc_track_caller(len, gfp);
|
[PATCH] kmemdup: introduce
One of idiomatic ways to duplicate a region of memory is
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
memcpy(dst, src, len);
which is neat code except a programmer needs to write size twice. Which
sometimes leads to mistakes. If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end. If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.
Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:
Linux:
[CRYPTO]: Fix memcpy/memset args.
[PATCH] memcpy/memset fixes
OpenBSD:
kerberosV/src/lib/asn1: der_copy.c:1.4
If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.
With kmemdup, the snippet above will be rewritten as:
dst = kmemdup(src, len, GFP_KERNEL);
if (!dst)
return -ENOMEM;
This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-01 14:27:20 +08:00
|
|
|
if (p)
|
|
|
|
memcpy(p, src, len);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(kmemdup);
|
|
|
|
|
2009-04-01 06:23:16 +08:00
|
|
|
/**
|
|
|
|
* memdup_user - duplicate memory region from user space
|
|
|
|
*
|
|
|
|
* @src: source address in user space
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
|
|
|
* Returns an ERR_PTR() on failure.
|
|
|
|
*/
|
|
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
|
|
{
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
|
|
* or GFP_ATOMIC.
|
|
|
|
*/
|
|
|
|
p = kmalloc_track_caller(len, GFP_KERNEL);
|
|
|
|
if (!p)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
|
|
kfree(p);
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
|
2012-08-14 20:55:21 +08:00
|
|
|
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
|
|
|
|
gfp_t flags)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
size_t ks = 0;
|
|
|
|
|
|
|
|
if (p)
|
|
|
|
ks = ksize(p);
|
|
|
|
|
|
|
|
if (ks >= new_size)
|
|
|
|
return (void *)p;
|
|
|
|
|
|
|
|
ret = kmalloc_track_caller(new_size, flags);
|
|
|
|
if (ret && p)
|
|
|
|
memcpy(ret, p, ks);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-07-17 19:03:21 +08:00
|
|
|
/**
|
2008-07-27 08:49:33 +08:00
|
|
|
* __krealloc - like krealloc() but don't free @p.
|
2007-07-17 19:03:21 +08:00
|
|
|
* @p: object to reallocate memory for.
|
|
|
|
* @new_size: how many bytes of memory are required.
|
|
|
|
* @flags: the type of memory to allocate.
|
|
|
|
*
|
2008-07-27 08:49:33 +08:00
|
|
|
* This function is like krealloc() except it never frees the originally
|
|
|
|
* allocated buffer. Use this if you don't want to free the buffer immediately
|
|
|
|
* like, for example, with RCU.
|
2007-07-17 19:03:21 +08:00
|
|
|
*/
|
2008-07-27 08:49:33 +08:00
|
|
|
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
|
2007-07-17 19:03:21 +08:00
|
|
|
{
|
2008-07-27 08:49:33 +08:00
|
|
|
if (unlikely(!new_size))
|
2007-07-17 19:03:22 +08:00
|
|
|
return ZERO_SIZE_PTR;
|
2007-07-17 19:03:21 +08:00
|
|
|
|
2012-08-14 20:55:21 +08:00
|
|
|
return __do_krealloc(p, new_size, flags);
|
2007-10-16 16:24:46 +08:00
|
|
|
|
2008-07-27 08:49:33 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__krealloc);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* krealloc - reallocate memory. The contents will remain unchanged.
|
|
|
|
* @p: object to reallocate memory for.
|
|
|
|
* @new_size: how many bytes of memory are required.
|
|
|
|
* @flags: the type of memory to allocate.
|
|
|
|
*
|
|
|
|
* The contents of the object pointed to are preserved up to the
|
|
|
|
* lesser of the new and old sizes. If @p is %NULL, krealloc()
|
2012-10-12 03:05:10 +08:00
|
|
|
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
|
2008-07-27 08:49:33 +08:00
|
|
|
* %NULL pointer, the object pointed to is freed.
|
|
|
|
*/
|
|
|
|
void *krealloc(const void *p, size_t new_size, gfp_t flags)
|
|
|
|
{
|
|
|
|
void *ret;
|
|
|
|
|
|
|
|
if (unlikely(!new_size)) {
|
2007-07-17 19:03:21 +08:00
|
|
|
kfree(p);
|
2008-07-27 08:49:33 +08:00
|
|
|
return ZERO_SIZE_PTR;
|
2007-07-17 19:03:21 +08:00
|
|
|
}
|
2008-07-27 08:49:33 +08:00
|
|
|
|
2012-08-14 20:55:21 +08:00
|
|
|
ret = __do_krealloc(p, new_size, flags);
|
2008-07-27 08:49:33 +08:00
|
|
|
if (ret && p != ret)
|
|
|
|
kfree(p);
|
|
|
|
|
2007-07-17 19:03:21 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(krealloc);
|
|
|
|
|
2009-02-21 07:38:41 +08:00
|
|
|
/**
 * kzfree - like kfree but zero memory
 * @p: object to free memory of
 *
 * The memory of the object @p points to is zeroed before it is freed.
 * If @p is %NULL (or the zero-size pointer), kzfree() does nothing.
 *
 * Note: the length zeroed is the one reported by ksize(), i.e. the whole
 * allocated buffer, which can be a good deal bigger than the size that was
 * requested from kmalloc(). So be careful when using this function in
 * performance sensitive code.
 */
void kzfree(const void *p)
{
	void *obj = (void *)p;

	if (unlikely(ZERO_OR_NULL_PTR(obj)))
		return;

	memset(obj, 0, ksize(obj));
	kfree(obj);
}
EXPORT_SYMBOL(kzfree);
|
|
|
|
|
2006-03-24 19:18:42 +08:00
|
|
|
/*
|
|
|
|
* strndup_user - duplicate an existing string from user space
|
|
|
|
* @s: The string to duplicate
|
|
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
|
|
|
*/
|
|
|
|
char *strndup_user(const char __user *s, long n)
|
|
|
|
{
|
|
|
|
char *p;
|
|
|
|
long length;
|
|
|
|
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
|
|
|
|
if (!length)
|
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
|
|
|
if (length > n)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2010-08-10 08:18:26 +08:00
|
|
|
p = memdup_user(s, length);
|
2006-03-24 19:18:42 +08:00
|
|
|
|
2010-08-10 08:18:26 +08:00
|
|
|
if (IS_ERR(p))
|
|
|
|
return p;
|
2006-03-24 19:18:42 +08:00
|
|
|
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(strndup_user);
|
2008-07-26 10:44:36 +08:00
|
|
|
|
mm: nommu: sort mm->mmap list properly
When I was reading nommu code, I found that it handles the vma list/tree
in an unusual way. IIUC, because there can be more than one
identical/overlapping vma in the list/tree, it sorts the tree more
strictly and does a linear search on the tree. But this is not applied to
the list (i.e. the list could be constructed in a different order than
the tree, so we can't use the list when finding the first vma in that
order).
Since inserting/sorting a vma in the tree and link is done at the same
time, we can easily construct both of them in the same order. And linear
searching on the tree could be more costly than doing it on the list, it
can be converted to use the list.
Also, after commit 297c5eee3724 ("mm: make the vma list be doubly
linked") made the list doubly linked, a couple of places in the code
needed to be fixed to construct the list properly.
Patch 1/6 is a preparation. It maintains the list sorted same as the tree
and construct doubly-linked list properly. Patch 2/6 is a simple
optimization for the vma deletion. Patch 3/6 and 4/6 convert tree
traversal to list traversal and the rest are simple fixes and cleanups.
This patch:
@vma added into @mm should be sorted by start addr, end addr and VMA
struct addr in that order because we may get identical VMAs in the @mm.
However this was true only for the rbtree, not for the list.
This patch fixes this by remembering 'rb_prev' during the tree traversal
like find_vma_prepare() does and linking the @vma via __vma_link_list().
After this patch, we can iterate the whole VMAs in correct order simply by
using @mm->mmap list.
[akpm@linux-foundation.org: avoid duplicating __vma_link_list()]
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-05-25 08:11:22 +08:00
|
|
|
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
struct vm_area_struct *prev, struct rb_node *rb_parent)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *next;
|
|
|
|
|
|
|
|
vma->vm_prev = prev;
|
|
|
|
if (prev) {
|
|
|
|
next = prev->vm_next;
|
|
|
|
prev->vm_next = vma;
|
|
|
|
} else {
|
|
|
|
mm->mmap = vma;
|
|
|
|
if (rb_parent)
|
|
|
|
next = rb_entry(rb_parent,
|
|
|
|
struct vm_area_struct, vm_rb);
|
|
|
|
else
|
|
|
|
next = NULL;
|
|
|
|
}
|
|
|
|
vma->vm_next = next;
|
|
|
|
if (next)
|
|
|
|
next->vm_prev = vma;
|
|
|
|
}
|
|
|
|
|
procfs: mark thread stack correctly in proc/<pid>/maps
Stack for a new thread is mapped by userspace code and passed via
sys_clone. This memory is currently seen as anonymous in
/proc/<pid>/maps, which makes it difficult to ascertain which mappings
are being used for thread stacks. This patch uses the individual task
stack pointers to determine which vmas are actually thread stacks.
For a multithreaded program like the following:
#include <pthread.h>
void *thread_main(void *foo)
{
while(1);
}
int main()
{
pthread_t t;
pthread_create(&t, NULL, thread_main, NULL);
pthread_join(t, NULL);
}
proc/PID/maps looks like the following:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack, since
the vma just before it (7f8a44491000-7f8a44492000) has no permissions, but
that is not always a reliable way to find out which vma is a thread
stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps have the same
content.
With this patch in place, /proc/PID/task/TID/maps are treated as 'maps
as the task would see it' and hence, only the vma that that task uses as
stack is marked as [stack]. All other 'stack' vmas are marked as
anonymous memory. /proc/PID/maps acts as a thread group level view,
where all thread stack vmas are marked as [stack:TID] where TID is the
process ID of the task that uses that vma as stack, while the process
stack is marked as [stack].
So /proc/PID/maps will look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Thus marking all vmas that are used as stacks by the threads in the
thread group along with the process stack. The task level maps will,
however, look like this:
00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out
00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out
019ef000-01a10000 rw-p 00000000 00:00 0 [heap]
7f8a44491000-7f8a44492000 ---p 00000000 00:00 0
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so
7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0
7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so
7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0
7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0
7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0
7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so
7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0
7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
where only the vma that is being used as a stack by *that* task is
marked as [stack].
Analogous changes have been made to /proc/PID/smaps,
/proc/PID/numa_maps, /proc/PID/task/TID/smaps and
/proc/PID/task/TID/numa_maps. Relevant snippets from smaps and
numa_maps:
[siddhesh@localhost ~ ]$ pgrep a.out
1441
[siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442]
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack"
7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack"
7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack]
[siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack"
7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2
7fff6273a000 default stack anon=3 dirty=3 N0=3
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack"
7f8a44492000 default stack anon=2 dirty=2 N0=2
[siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack"
7fff6273a000 default stack anon=3 dirty=3 N0=3
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix build]
Signed-off-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:04 +08:00
|
|
|
/* Check if the vma is being used as a stack by this task */
|
|
|
|
static int vm_is_stack_for_task(struct task_struct *t,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the vma is being used as a stack.
|
|
|
|
* If is_group is non-zero, check in the entire thread group or else
|
|
|
|
* just check in the current task. Returns the pid of the task that
|
|
|
|
* the vma is stack for.
|
|
|
|
*/
|
|
|
|
pid_t vm_is_stack(struct task_struct *task,
|
|
|
|
struct vm_area_struct *vma, int in_group)
|
|
|
|
{
|
|
|
|
pid_t ret = 0;
|
|
|
|
|
|
|
|
if (vm_is_stack_for_task(task, vma))
|
|
|
|
return task->pid;
|
|
|
|
|
|
|
|
if (in_group) {
|
|
|
|
struct task_struct *t;
|
|
|
|
rcu_read_lock();
|
|
|
|
if (!pid_alive(task))
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
t = task;
|
|
|
|
do {
|
|
|
|
if (vm_is_stack_for_task(t, vma)) {
|
|
|
|
ret = t->pid;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
} while_each_thread(task, t);
|
|
|
|
done:
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-01-16 09:01:35 +08:00
|
|
|
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Generic mmap layout, built only for CONFIG_MMU when
 * HAVE_ARCH_PICK_MMAP_LAYOUT is not defined: mappings start at
 * TASK_UNMAPPED_BASE and are placed by arch_get_unmapped_area().
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
	mm->get_unmapped_area = arch_get_unmapped_area;
	mm->mmap_base = TASK_UNMAPPED_BASE;
}
#endif
|
2008-08-13 06:52:52 +08:00
|
|
|
|
2010-08-22 19:08:57 +08:00
|
|
|
/*
|
|
|
|
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
|
|
|
|
* back to the regular GUP.
|
2011-03-31 09:57:33 +08:00
|
|
|
* If the architecture not support this function, simply return with no
|
2010-08-22 19:08:57 +08:00
|
|
|
* page pinned
|
|
|
|
*/
|
2014-04-08 06:37:26 +08:00
|
|
|
int __weak __get_user_pages_fast(unsigned long start,
|
2010-08-22 19:08:57 +08:00
|
|
|
int nr_pages, int write, struct page **pages)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
|
|
|
|
|
2009-04-14 05:40:05 +08:00
|
|
|
/**
|
|
|
|
* get_user_pages_fast() - pin user pages in memory
|
|
|
|
* @start: starting user address
|
|
|
|
* @nr_pages: number of pages from start to pin
|
|
|
|
* @write: whether pages will be written to
|
|
|
|
* @pages: array that receives pointers to the pages pinned.
|
|
|
|
* Should be at least nr_pages long.
|
|
|
|
*
|
|
|
|
* Returns number of pages pinned. This may be fewer than the number
|
|
|
|
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
|
|
|
* were pinned, returns -errno.
|
2009-06-17 06:31:39 +08:00
|
|
|
*
|
|
|
|
* get_user_pages_fast provides equivalent functionality to get_user_pages,
|
|
|
|
* operating on current and current->mm, with force=0 and vma=NULL. However
|
|
|
|
* unlike get_user_pages, it must be called without mmap_sem held.
|
|
|
|
*
|
|
|
|
* get_user_pages_fast may take mmap_sem and page table locks, so no
|
|
|
|
* assumptions can be made about lack of locking. get_user_pages_fast is to be
|
|
|
|
* implemented in a way that is advantageous (vs get_user_pages()) when the
|
|
|
|
* user memory area is already faulted in and present in ptes. However if the
|
|
|
|
* pages have to be faulted in, it may turn out to be slightly slower so
|
|
|
|
* callers need to carefully consider what to use. On many architectures,
|
|
|
|
* get_user_pages_fast simply falls back to get_user_pages.
|
2009-04-14 05:40:05 +08:00
|
|
|
*/
|
2014-04-08 06:37:26 +08:00
|
|
|
int __weak get_user_pages_fast(unsigned long start,
|
2008-08-13 06:52:52 +08:00
|
|
|
int nr_pages, int write, struct page **pages)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
down_read(&mm->mmap_sem);
|
|
|
|
ret = get_user_pages(current, mm, start, nr_pages,
|
|
|
|
write, 0, pages, NULL);
|
|
|
|
up_read(&mm->mmap_sem);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
2009-03-23 21:12:24 +08:00
|
|
|
|
2012-05-31 08:17:35 +08:00
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long pgoff)
|
|
|
|
{
|
|
|
|
unsigned long ret;
|
|
|
|
struct mm_struct *mm = current->mm;
|
2013-02-23 08:32:47 +08:00
|
|
|
unsigned long populate;
|
2012-05-31 08:17:35 +08:00
|
|
|
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
|
|
|
if (!ret) {
|
|
|
|
down_write(&mm->mmap_sem);
|
2013-02-23 08:32:37 +08:00
|
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
|
|
|
|
&populate);
|
2012-05-31 08:17:35 +08:00
|
|
|
up_write(&mm->mmap_sem);
|
2013-02-23 08:32:47 +08:00
|
|
|
if (populate)
|
|
|
|
mm_populate(ret, populate);
|
2012-05-31 08:17:35 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
|
|
return -EINVAL;
|
|
|
|
if (unlikely(offset & ~PAGE_MASK))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
|
2013-02-23 08:34:35 +08:00
|
|
|
struct address_space *page_mapping(struct page *page)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = page->mapping;
|
|
|
|
|
2014-01-15 09:56:40 +08:00
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
|
|
|
if (unlikely(PageSlab(page)))
|
|
|
|
return NULL;
|
|
|
|
|
2013-02-23 08:34:37 +08:00
|
|
|
if (unlikely(PageSwapCache(page))) {
|
|
|
|
swp_entry_t entry;
|
|
|
|
|
|
|
|
entry.val = page_private(page);
|
|
|
|
mapping = swap_address_space(entry);
|
2013-09-12 05:21:29 +08:00
|
|
|
} else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
|
2013-02-23 08:34:35 +08:00
|
|
|
mapping = NULL;
|
|
|
|
return mapping;
|
|
|
|
}
|
|
|
|
|
2014-01-22 07:49:14 +08:00
|
|
|
int overcommit_ratio_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_kbytes = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int overcommit_kbytes_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
|
|
if (ret == 0 && write)
|
|
|
|
sysctl_overcommit_ratio = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-13 07:08:31 +08:00
|
|
|
/*
|
|
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
|
|
*/
|
|
|
|
unsigned long vm_commit_limit(void)
|
|
|
|
{
|
2014-01-22 07:49:14 +08:00
|
|
|
unsigned long allowed;
|
|
|
|
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
else
|
|
|
|
allowed = ((totalram_pages - hugetlb_total_pages())
|
|
|
|
* sysctl_overcommit_ratio / 100);
|
|
|
|
allowed += total_swap_pages;
|
|
|
|
|
|
|
|
return allowed;
|
2013-11-13 07:08:31 +08:00
|
|
|
}
|
|
|
|
|
2014-02-12 02:11:59 +08:00
|
|
|
/**
|
|
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
|
|
* @task: the task whose cmdline value to copy.
|
|
|
|
* @buffer: the buffer to copy to.
|
|
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
|
|
* to this length.
|
|
|
|
* Returns the size of the cmdline field copied. Note that the copy does
|
|
|
|
* not guarantee an ending NULL byte.
|
|
|
|
*/
|
|
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
|
|
{
|
|
|
|
int res = 0;
|
|
|
|
unsigned int len;
|
|
|
|
struct mm_struct *mm = get_task_mm(task);
|
|
|
|
if (!mm)
|
|
|
|
goto out;
|
|
|
|
if (!mm->arg_end)
|
|
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
|
|
|
|
len = mm->arg_end - mm->arg_start;
|
|
|
|
|
|
|
|
if (len > buflen)
|
|
|
|
len = buflen;
|
|
|
|
|
|
|
|
res = access_process_vm(task, mm->arg_start, buffer, len, 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the nul at the end of args has been overwritten, then
|
|
|
|
* assume application is using setproctitle(3).
|
|
|
|
*/
|
|
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
|
|
len = strnlen(buffer, res);
|
|
|
|
if (len < res) {
|
|
|
|
res = len;
|
|
|
|
} else {
|
|
|
|
len = mm->env_end - mm->env_start;
|
|
|
|
if (len > buflen - res)
|
|
|
|
len = buflen - res;
|
|
|
|
res += access_process_vm(task, mm->env_start,
|
|
|
|
buffer+res, len, 0);
|
|
|
|
res = strnlen(buffer, res);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out_mm:
|
|
|
|
mmput(mm);
|
|
|
|
out:
|
|
|
|
return res;
|
|
|
|
}
|
2013-11-13 07:08:31 +08:00
|
|
|
|
2009-03-23 21:12:24 +08:00
|
|
|
/* Tracepoint definitions. */
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kfree);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
|