linux/arch/x86_64/kernel/sys_x86_64.c

/*
 * linux/arch/x86_64/kernel/sys_x86_64.c
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>

#include <asm/uaccess.h>
#include <asm/ia32.h>

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
asmlinkage long sys_pipe(int __user *fildes)
{
	int fd[2];
	int error;

	error = do_pipe(fd);
	if (!error) {
		if (copy_to_user(fildes, fd, 2*sizeof(int)))
			error = -EFAULT;
	}
	return error;
}

asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
	unsigned long fd, unsigned long off)
{
	long error;
	struct file * file;

	error = -EINVAL;
	if (off & ~PAGE_MASK)
		goto out;

	error = -EBADF;
	file = NULL;
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
	if (!(flags & MAP_ANONYMOUS)) {
		file = fget(fd);
		if (!file)
			goto out;
	}
	down_write(&current->mm->mmap_sem);
	error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
	up_write(&current->mm->mmap_sem);

	if (file)
		fput(file);
out:
	return error;
}

static void find_start_end(unsigned long flags, unsigned long *begin,
			   unsigned long *end)
{
	if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
		/* This is usually used needed to map code in small
		   model, so it needs to be in the first 31bit. Limit
		   it to that.  This means we need to move the
		   unmapped base down for this case. This can give
		   conflicts with the heap, but we assume that glibc
		   malloc knows how to fall back to mmap. Give it 1GB
		   of playground for now. -AK */ 
		*begin = 0x40000000; 
		*end = 0x80000000;		
	} else {
		*begin = TASK_UNMAPPED_BASE;
		*end = TASK_SIZE; 
	}
} 

unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;
	unsigned long begin, end;
	
	find_start_end(flags, &begin, &end); 

	if (len > end)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (end - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
	    && len <= mm->cached_hole_size) {
	        mm->cached_hole_size = 0;
		mm->free_area_cache = begin;
	}
	addr = mm->free_area_cache;
	if (addr < begin) 
		addr = begin; 
	start_addr = addr;

full_search:
	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (end - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != begin) {
				start_addr = addr = begin;
				mm->cached_hole_size = 0;
				goto full_search;
			}
			return -ENOMEM;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
		        mm->cached_hole_size = vma->vm_start - addr;

		addr = vma->vm_end;
	}
}

asmlinkage long sys_uname(struct new_utsname __user * name)
{
	int err;
	down_read(&uts_sem);
	err = copy_to_user(name, &system_utsname, sizeof (*name));
	up_read(&uts_sem);
	if (personality(current->personality) == PER_LINUX32) 
		err |= copy_to_user(&name->machine, "i686", 5); 		
	return err ? -EFAULT : 0;
}

asmlinkage long sys_time64(long __user * tloc)
{
	struct timeval now; 
	int i; 

	do_gettimeofday(&now);
	i = now.tv_sec;
	if (tloc) {
		if (put_user(i,tloc))
			i = -EFAULT;
	}
	return i;
}
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`/*`
			`* linux/arch/x86_64/kernel/sys_x86_64.c`
			`*/`

			`#include <linux/errno.h>`
			`#include <linux/sched.h>`
			`#include <linux/syscalls.h>`
			`#include <linux/mm.h>`
			`#include <linux/smp.h>`
			`#include <linux/smp_lock.h>`
			`#include <linux/sem.h>`
			`#include <linux/msg.h>`
			`#include <linux/shm.h>`
			`#include <linux/stat.h>`
			`#include <linux/mman.h>`
			`#include <linux/file.h>`
			`#include <linux/utsname.h>`
			`#include <linux/personality.h>`

			`#include <asm/uaccess.h>`
			`#include <asm/ia32.h>`

			`/*`
			`* sys_pipe() is the normal C calling standard for creating`
			`* a pipe. It's not the way Unix traditionally does this, though.`
			`*/`
			`asmlinkage long sys_pipe(int __user *fildes)`
			`{`
			`int fd[2];`
			`int error;`

			`error = do_pipe(fd);`
			`if (!error) {`
			`if (copy_to_user(fildes, fd, 2*sizeof(int)))`
			`error = -EFAULT;`
			`}`
			`return error;`
			`}`

			`asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,`
			`unsigned long fd, unsigned long off)`
			`{`
			`long error;`
			`struct file * file;`

			`error = -EINVAL;`
			`if (off & ~PAGE_MASK)`
			`goto out;`

			`error = -EBADF;`
			`file = NULL;`
			`flags &= ~(MAP_EXECUTABLE \| MAP_DENYWRITE);`
			`if (!(flags & MAP_ANONYMOUS)) {`
			`file = fget(fd);`
			`if (!file)`
			`goto out;`
			`}`
			`down_write(&current->mm->mmap_sem);`
			`error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);`
			`up_write(&current->mm->mmap_sem);`

			`if (file)`
			`fput(file);`
			`out:`
			`return error;`
			`}`

			`static void find_start_end(unsigned long flags, unsigned long *begin,`
			`unsigned long *end)`
			`{`
[PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes Appended patch will setup compatibility mode TASK_SIZE properly. This will fix atleast three known bugs that can be encountered while running compatibility mode apps. a) A malicious 32bit app can have an elf section at 0xffffe000. During exec of this app, we will have a memory leak as insert_vm_struct() is not checking for return value in syscall32_setup_pages() and thus not freeing the vma allocated for the vsyscall page. And instead of exec failing (as it has addresses > TASK_SIZE), we were allowing it to succeed previously. b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area may return addresses beyond 32bits, ultimately causing corruption because of wrap-around and resulting in SEGFAULT, instead of returning ENOMEM. c) 32bit app doing this below mmap will now fail. mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ\|PROT_WRITE, MAP_FIXED\|MAP_PRIVATE\|MAP_ANON, 0, 0); Signed-off-by: Zou Nan hai <nanhai.zou@intel.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:32 +08:00			`if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`/* This is usually used needed to map code in small`
			`model, so it needs to be in the first 31bit. Limit`
			`it to that. This means we need to move the`
			`unmapped base down for this case. This can give`
			`conflicts with the heap, but we assume that glibc`
			`malloc knows how to fall back to mmap. Give it 1GB`
			`of playground for now. -AK */`
			`*begin = 0x40000000;`
			`*end = 0x80000000;`
[PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes Appended patch will setup compatibility mode TASK_SIZE properly. This will fix atleast three known bugs that can be encountered while running compatibility mode apps. a) A malicious 32bit app can have an elf section at 0xffffe000. During exec of this app, we will have a memory leak as insert_vm_struct() is not checking for return value in syscall32_setup_pages() and thus not freeing the vma allocated for the vsyscall page. And instead of exec failing (as it has addresses > TASK_SIZE), we were allowing it to succeed previously. b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area may return addresses beyond 32bits, ultimately causing corruption because of wrap-around and resulting in SEGFAULT, instead of returning ENOMEM. c) 32bit app doing this below mmap will now fail. mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ\|PROT_WRITE, MAP_FIXED\|MAP_PRIVATE\|MAP_ANON, 0, 0); Signed-off-by: Zou Nan hai <nanhai.zou@intel.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:32 +08:00			`} else {`
			`*begin = TASK_UNMAPPED_BASE;`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`*end = TASK_SIZE;`
[PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes Appended patch will setup compatibility mode TASK_SIZE properly. This will fix atleast three known bugs that can be encountered while running compatibility mode apps. a) A malicious 32bit app can have an elf section at 0xffffe000. During exec of this app, we will have a memory leak as insert_vm_struct() is not checking for return value in syscall32_setup_pages() and thus not freeing the vma allocated for the vsyscall page. And instead of exec failing (as it has addresses > TASK_SIZE), we were allowing it to succeed previously. b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area may return addresses beyond 32bits, ultimately causing corruption because of wrap-around and resulting in SEGFAULT, instead of returning ENOMEM. c) 32bit app doing this below mmap will now fail. mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ\|PROT_WRITE, MAP_FIXED\|MAP_PRIVATE\|MAP_ANON, 0, 0); Signed-off-by: Zou Nan hai <nanhai.zou@intel.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:32 +08:00			`}`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`}`

			`unsigned long`
			`arch_get_unmapped_area(struct file *filp, unsigned long addr,`
			`unsigned long len, unsigned long pgoff, unsigned long flags)`
			`{`
			`struct mm_struct *mm = current->mm;`
			`struct vm_area_struct *vma;`
			`unsigned long start_addr;`
			`unsigned long begin, end;`

			`find_start_end(flags, &begin, &end);`

			`if (len > end)`
			`return -ENOMEM;`

			`if (addr) {`
			`addr = PAGE_ALIGN(addr);`
			`vma = find_vma(mm, addr);`
			`if (end - len >= addr &&`
			`(!vma \|\| addr + len <= vma->vm_start))`
			`return addr;`
			`}`
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:49 +08:00			`if (((flags & MAP_32BIT) \|\| test_thread_flag(TIF_IA32))`
			`&& len <= mm->cached_hole_size) {`
			`mm->cached_hole_size = 0;`
			`mm->free_area_cache = begin;`
			`}`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`addr = mm->free_area_cache;`
			`if (addr < begin)`
			`addr = begin;`
			`start_addr = addr;`

			`full_search:`
			`for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {`
			`/* At this point: (!vma \|\| addr < vma->vm_end). */`
			`if (end - len < addr) {`
			`/*`
			`* Start a new search - just in case we missed`
			`* some holes.`
			`*/`
			`if (start_addr != begin) {`
			`start_addr = addr = begin;`
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:49 +08:00			`mm->cached_hole_size = 0;`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`goto full_search;`
			`}`
			`return -ENOMEM;`
			`}`
			`if (!vma \|\| addr + len <= vma->vm_start) {`
			`/*`
			`* Remember the place where we stopped the search:`
			`*/`
			`mm->free_area_cache = addr + len;`
			`return addr;`
			`}`
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2005-06-22 08:14:49 +08:00			`if (addr + mm->cached_hole_size < vma->vm_start)`
			`mm->cached_hole_size = vma->vm_start - addr;`

Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`addr = vma->vm_end;`
			`}`
			`}`

			`asmlinkage long sys_uname(struct new_utsname __user * name)`
			`{`
			`int err;`
			`down_read(&uts_sem);`
			`err = copy_to_user(name, &system_utsname, sizeof (*name));`
			`up_read(&uts_sem);`
			`if (personality(current->personality) == PER_LINUX32)`
			`err \|= copy_to_user(&name->machine, "i686", 5);`
			`return err ? -EFAULT : 0;`
			`}`

			`asmlinkage long sys_time64(long __user * tloc)`
			`{`
			`struct timeval now;`
			`int i;`

			`do_gettimeofday(&now);`
			`i = now.tv_sec;`
			`if (tloc) {`
			`if (put_user(i,tloc))`
			`i = -EFAULT;`
			`}`
			`return i;`
			`}`