linux/arch/sparc/kernel/sys_sparc_64.c

650 lines
16 KiB
C
Raw Normal View History

/* linux/arch/sparc64/kernel/sys_sparc.c
*
* This file contains various random system calls that
* have a non-standard calling sequence on the Linux/sparc
* platform.
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/debug.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/utsname.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/ipc.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/uaccess.h>
#include <asm/utrap.h>
[PATCH] provide kernel_execve on all architectures This adds the new kernel_execve function on all architectures that were using _syscall3() to implement execve. The implementation uses code from the _syscall3 macros provided in the unistd.h header file. I don't have cross-compilers for any of these architectures, so the patch is untested with the exception of i386. Most architectures can probably implement this in a nicer way in assembly or by combining it with the sys_execve implementation itself, but this should do it for now. [bunk@stusta.de: m68knommu build fix] [markh@osdl.org: build fix] [bero@arklinux.org: build fix] [ralf@linux-mips.org: mips fix] [schwidefsky@de.ibm.com: s390 fix] Signed-off-by: Arnd Bergmann <arnd@arndb.de> Cc: Andi Kleen <ak@muc.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Hirokazu Takata <takata.hirokazu@renesas.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Chris Zankel <chris@zankel.net> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Bernhard Rosenkraenzer <bero@arklinux.org> Signed-off-by: Mark Haverkamp <markh@osdl.org> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:18:34 +08:00
#include <asm/unistd.h>
#include "entry.h"
#include "kernel.h"
#include "systbls.h"
/* #define DEBUG_UNIMP_SYSCALL */
asmlinkage unsigned long sys_getpagesize(void)
{
return PAGE_SIZE;
}
/* Does addr --> addr+len fall within 4GB of the VA-space hole or
* overflow past the end of the 64-bit address space?
*/
static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
{
unsigned long va_exclude_start, va_exclude_end;
va_exclude_start = VA_EXCLUDE_START;
va_exclude_end = VA_EXCLUDE_END;
if (unlikely(len >= va_exclude_start))
return 1;
if (unlikely((addr + len) < addr))
return 1;
if (unlikely((addr >= va_exclude_start && addr < va_exclude_end) ||
((addr + len) >= va_exclude_start &&
(addr + len) < va_exclude_end)))
return 1;
return 0;
}
/* These functions differ from the default implementations in
* mm/mmap.c in two ways:
*
* 1) For file backed MAP_SHARED mmap()'s we D-cache color align,
* for fixed such mappings we just validate what the user gave us.
* 2) For 64-bit tasks we avoid mapping anything within 4GB of
* the spitfire/niagara VA-hole.
*/
static inline unsigned long COLOR_ALIGN(unsigned long addr,
unsigned long pgoff)
{
unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1);
unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
return base + off;
}
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct * vma;
unsigned long task_size = TASK_SIZE;
int do_color_align;
struct vm_unmapped_area_info info;
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
if ((flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
return -EINVAL;
return addr;
}
if (test_thread_flag(TIF_32BIT))
task_size = STACK_TOP32;
if (unlikely(len > task_size || len >= VA_EXCLUDE_START))
return -ENOMEM;
do_color_align = 0;
if (filp || (flags & MAP_SHARED))
do_color_align = 1;
if (addr) {
if (do_color_align)
addr = COLOR_ALIGN(addr, pgoff);
else
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (task_size - len >= addr &&
mm: larger stack guard gap, between vmas Stack guard page is a useful feature to reduce a risk of stack smashing into a different mapping. We have been using a single page gap which is sufficient to prevent having stack adjacent to a different mapping. But this seems to be insufficient in the light of the stack usage in userspace. E.g. glibc uses as large as 64kB alloca() in many commonly used functions. Others use constructs liks gid_t buffer[NGROUPS_MAX] which is 256kB or stack strings with MAX_ARG_STRLEN. This will become especially dangerous for suid binaries and the default no limit for the stack size limit because those applications can be tricked to consume a large portion of the stack and a single glibc call could jump over the guard page. These attacks are not theoretical, unfortunatelly. Make those attacks less probable by increasing the stack guard gap to 1MB (on systems with 4k pages; but make it depend on the page size because systems with larger base pages might cap stack allocations in the PAGE_SIZE units) which should cover larger alloca() and VLA stack allocations. It is obviously not a full fix because the problem is somehow inherent, but it should reduce attack space a lot. One could argue that the gap size should be configurable from userspace, but that can be done later when somebody finds that the new 1MB is wrong for some special case applications. For now, add a kernel command line option (stack_guard_gap) to specify the stack gap size (in page units). Implementation wise, first delete all the old code for stack guard page: because although we could get away with accounting one extra page in a stack vma, accounting a larger gap can break userspace - case in point, a program run with "ulimit -S -v 20000" failed when the 1MB gap was counted for RLIMIT_AS; similar problems could come with RLIMIT_MLOCK and strict non-overcommit mode. Instead of keeping gap inside the stack vma, maintain the stack guard gap as a gap between vmas: using vm_start_gap() in place of vm_start (or vm_end_gap() in place of vm_end if VM_GROWSUP) in just those few places which need to respect the gap - mainly arch_get_unmapped_area(), and and the vma tree's subtree_gap support for that. Original-patch-by: Oleg Nesterov <oleg@redhat.com> Original-patch-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Tested-by: Helge Deller <deller@gmx.de> # parisc Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-06-19 19:03:24 +08:00
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = min(task_size, VA_EXCLUDE_START);
info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
VM_BUG_ON(addr != -ENOMEM);
info.low_limit = VA_EXCLUDE_END;
info.high_limit = task_size;
addr = vm_unmapped_area(&info);
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-22 08:14:49 +08:00
}
return addr;
}
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long task_size = STACK_TOP32;
unsigned long addr = addr0;
int do_color_align;
struct vm_unmapped_area_info info;
/* This should only ever run for 32-bit processes. */
BUG_ON(!test_thread_flag(TIF_32BIT));
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
if ((flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
return -EINVAL;
return addr;
}
if (unlikely(len > task_size))
return -ENOMEM;
do_color_align = 0;
if (filp || (flags & MAP_SHARED))
do_color_align = 1;
/* requesting a specific address */
if (addr) {
if (do_color_align)
addr = COLOR_ALIGN(addr, pgoff);
else
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (task_size - len >= addr &&
mm: larger stack guard gap, between vmas Stack guard page is a useful feature to reduce a risk of stack smashing into a different mapping. We have been using a single page gap which is sufficient to prevent having stack adjacent to a different mapping. But this seems to be insufficient in the light of the stack usage in userspace. E.g. glibc uses as large as 64kB alloca() in many commonly used functions. Others use constructs liks gid_t buffer[NGROUPS_MAX] which is 256kB or stack strings with MAX_ARG_STRLEN. This will become especially dangerous for suid binaries and the default no limit for the stack size limit because those applications can be tricked to consume a large portion of the stack and a single glibc call could jump over the guard page. These attacks are not theoretical, unfortunatelly. Make those attacks less probable by increasing the stack guard gap to 1MB (on systems with 4k pages; but make it depend on the page size because systems with larger base pages might cap stack allocations in the PAGE_SIZE units) which should cover larger alloca() and VLA stack allocations. It is obviously not a full fix because the problem is somehow inherent, but it should reduce attack space a lot. One could argue that the gap size should be configurable from userspace, but that can be done later when somebody finds that the new 1MB is wrong for some special case applications. For now, add a kernel command line option (stack_guard_gap) to specify the stack gap size (in page units). Implementation wise, first delete all the old code for stack guard page: because although we could get away with accounting one extra page in a stack vma, accounting a larger gap can break userspace - case in point, a program run with "ulimit -S -v 20000" failed when the 1MB gap was counted for RLIMIT_AS; similar problems could come with RLIMIT_MLOCK and strict non-overcommit mode. Instead of keeping gap inside the stack vma, maintain the stack guard gap as a gap between vmas: using vm_start_gap() in place of vm_start (or vm_end_gap() in place of vm_end if VM_GROWSUP) in just those few places which need to respect the gap - mainly arch_get_unmapped_area(), and and the vma tree's subtree_gap support for that. Original-patch-by: Oleg Nesterov <oleg@redhat.com> Original-patch-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Tested-by: Helge Deller <deller@gmx.de> # parisc Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-06-19 19:03:24 +08:00
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
if (addr & ~PAGE_MASK) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = STACK_TOP32;
addr = vm_unmapped_area(&info);
}
return addr;
}
/* Try to align mapping such that we align it as much as possible. */
unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
unsigned long align_goal, addr = -ENOMEM;
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
get_area = current->mm->get_unmapped_area;
if (flags & MAP_FIXED) {
/* Ok, don't mess with it. */
return get_area(NULL, orig_addr, len, pgoff, flags);
}
flags &= ~MAP_SHARED;
align_goal = PAGE_SIZE;
if (len >= (4UL * 1024 * 1024))
align_goal = (4UL * 1024 * 1024);
else if (len >= (512UL * 1024))
align_goal = (512UL * 1024);
else if (len >= (64UL * 1024))
align_goal = (64UL * 1024);
do {
addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags);
if (!(addr & ~PAGE_MASK)) {
addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL);
break;
}
if (align_goal == (4UL * 1024 * 1024))
align_goal = (512UL * 1024);
else if (align_goal == (512UL * 1024))
align_goal = (64UL * 1024);
else
align_goal = PAGE_SIZE;
} while ((addr & ~PAGE_MASK) && align_goal > PAGE_SIZE);
/* Mapping is smaller than 64K or larger areas could not
* be obtained.
*/
if (addr & ~PAGE_MASK)
addr = get_area(NULL, orig_addr, len, pgoff, flags);
return addr;
}
EXPORT_SYMBOL(get_fb_unmapped_area);
/* Essentially the same as PowerPC. */
static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0UL;
if (current->flags & PF_RANDOMIZE) {
unsigned long val = get_random_long();
if (test_thread_flag(TIF_32BIT))
rnd = (val % (1UL << (23UL-PAGE_SHIFT)));
else
rnd = (val % (1UL << (30UL-PAGE_SHIFT)));
}
return rnd << PAGE_SHIFT;
}
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = mmap_rnd();
unsigned long gap;
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
gap = rlimit(RLIMIT_STACK);
if (!test_thread_flag(TIF_32BIT) ||
(current->personality & ADDR_COMPAT_LAYOUT) ||
gap == RLIM_INFINITY ||
sysctl_legacy_va_layout) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
/* We know it's 32-bit */
unsigned long task_size = STACK_TOP32;
if (gap < 128 * 1024 * 1024)
gap = 128 * 1024 * 1024;
if (gap > (task_size / 6 * 5))
gap = (task_size / 6 * 5);
mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
/*
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way unix traditionally does this, though.
*/
SYSCALL_DEFINE1(sparc_pipe_real, struct pt_regs *, regs)
{
int fd[2];
int error;
flag parameters: pipe This patch introduces the new syscall pipe2 which is like pipe but it also takes an additional parameter which takes a flag value. This patch implements the handling of O_CLOEXEC for the flag. I did not add support for the new syscall for the architectures which have a special sys_pipe implementation. I think the maintainers of those archs have the chance to go with the unified implementation but that's up to them. The implementation introduces do_pipe_flags. I did that instead of changing all callers of do_pipe because some of the callers are written in assembler. I would probably screw up changing the assembly code. To avoid breaking code do_pipe is now a small wrapper around do_pipe_flags. Once all callers are changed over to do_pipe_flags the old do_pipe function can be removed. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include <fcntl.h> #include <stdio.h> #include <unistd.h> #include <sys/syscall.h> #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fd[2]; if (syscall (__NR_pipe2, fd, 0) != 0) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { printf ("pipe2(0) set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0) { puts ("pipe2(O_CLOEXEC) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper <drepper@redhat.com> Acked-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@googlemail.com> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:29:30 +08:00
error = do_pipe_flags(fd, 0);
if (error)
goto out;
regs->u_regs[UREG_I1] = fd[1];
error = fd[0];
out:
return error;
}
/*
* sys_ipc() is the de-multiplexer for the SysV IPC calls..
*
* This is really horribly ugly.
*/
Add generic sys_ipc wrapper Add a generic implementation of the ipc demultiplexer syscall. Except for s390 and sparc64 all implementations of the sys_ipc are nearly identical. There are slight differences in the types of the parameters, where mips and powerpc as the only 64-bit architectures with sys_ipc use unsigned long for the "third" argument as it gets casted to a pointer later, while it traditionally is an "int" like most other paramters. frv goes even further and uses unsigned long for all parameters execept for "ptr" which is a pointer type everywhere. The change from int to unsigned long for "third" and back to "int" for the others on frv should be fine due to the in-register calling conventions for syscalls (we already had a similar issue with the generic sys_ptrace), but I'd prefer to have the arch maintainers looks over this in details. Except for that h8300, m68k and m68knommu lack an impplementation of the semtimedop sub call which this patch adds, and various architectures have gets used - at least on i386 it seems superflous as the compat code on x86-64 and ia64 doesn't even bother to implement it. [akpm@linux-foundation.org: add sys_ipc to sys_ni.c] Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Reviewed-by: H. Peter Anvin <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: James Morris <jmorris@namei.org> Cc: Andreas Schwab <schwab@linux-m68k.org> Acked-by: Jesper Nilsson <jesper.nilsson@axis.com> Acked-by: Russell King <rmk+kernel@arm.linux.org.uk> Acked-by: David Howells <dhowells@redhat.com> Acked-by: Kyle McMartin <kyle@mcmartin.ca> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-11 07:21:18 +08:00
SYSCALL_DEFINE6(sparc_ipc, unsigned int, call, int, first, unsigned long, second,
unsigned long, third, void __user *, ptr, long, fifth)
{
long err;
/* No need for backward compatibility. We can start fresh... */
if (call <= SEMTIMEDOP) {
switch (call) {
case SEMOP:
err = sys_semtimedop(first, ptr,
(unsigned int)second, NULL);
goto out;
case SEMTIMEDOP:
err = sys_semtimedop(first, ptr, (unsigned int)second,
(const struct timespec __user *)
(unsigned long) fifth);
goto out;
case SEMGET:
err = sys_semget(first, (int)second, (int)third);
goto out;
case SEMCTL: {
err = sys_semctl(first, second,
(int)third | IPC_64,
(unsigned long) ptr);
goto out;
}
default:
err = -ENOSYS;
goto out;
}
}
if (call <= MSGCTL) {
switch (call) {
case MSGSND:
err = sys_msgsnd(first, ptr, (size_t)second,
(int)third);
goto out;
case MSGRCV:
err = sys_msgrcv(first, ptr, (size_t)second, fifth,
(int)third);
goto out;
case MSGGET:
err = sys_msgget((key_t)first, (int)second);
goto out;
case MSGCTL:
err = sys_msgctl(first, (int)second | IPC_64, ptr);
goto out;
default:
err = -ENOSYS;
goto out;
}
}
if (call <= SHMCTL) {
switch (call) {
case SHMAT: {
ulong raddr;
err = do_shmat(first, ptr, (int)second, &raddr, SHMLBA);
if (!err) {
if (put_user(raddr,
(ulong __user *) third))
err = -EFAULT;
}
goto out;
}
case SHMDT:
err = sys_shmdt(ptr);
goto out;
case SHMGET:
err = sys_shmget(first, (size_t)second, (int)third);
goto out;
case SHMCTL:
err = sys_shmctl(first, (int)second | IPC_64, ptr);
goto out;
default:
err = -ENOSYS;
goto out;
}
} else {
err = -ENOSYS;
}
out:
return err;
}
SYSCALL_DEFINE1(sparc64_personality, unsigned long, personality)
{
long ret;
if (personality(current->personality) == PER_LINUX32 &&
personality(personality) == PER_LINUX)
personality |= PER_LINUX32;
ret = sys_personality(personality);
if (personality(ret) == PER_LINUX32)
ret &= ~PER_LINUX32;
return ret;
}
int sparc_mmap_check(unsigned long addr, unsigned long len)
{
if (test_thread_flag(TIF_32BIT)) {
if (len >= STACK_TOP32)
return -EINVAL;
if (addr > STACK_TOP32 - len)
return -EINVAL;
} else {
if (len >= VA_EXCLUDE_START)
return -EINVAL;
if (invalid_64bit_range(addr, len))
return -EINVAL;
}
return 0;
}
/* Linux version of mmap */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags, unsigned long, fd,
unsigned long, off)
{
unsigned long retval = -EINVAL;
if ((off + PAGE_ALIGN(len)) < off)
goto out;
if (off & ~PAGE_MASK)
goto out;
retval = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
out:
return retval;
}
SYSCALL_DEFINE2(64_munmap, unsigned long, addr, size_t, len)
{
if (invalid_64bit_range(addr, len))
return -EINVAL;
return vm_munmap(addr, len);
}
SYSCALL_DEFINE5(64_mremap, unsigned long, addr, unsigned long, old_len,
unsigned long, new_len, unsigned long, flags,
unsigned long, new_addr)
{
if (test_thread_flag(TIF_32BIT))
return -EINVAL;
return sys_mremap(addr, old_len, new_len, flags, new_addr);
}
/* we come to here via sys_nis_syscall so it can setup the regs argument */
asmlinkage unsigned long c_sys_nis_syscall(struct pt_regs *regs)
{
static int count;
/* Don't make the system unusable, if someone goes stuck */
if (count++ > 5)
return -ENOSYS;
printk ("Unimplemented SPARC system call %ld\n",regs->u_regs[1]);
#ifdef DEBUG_UNIMP_SYSCALL
show_regs (regs);
#endif
return -ENOSYS;
}
/* #define DEBUG_SPARC_BREAKPOINT */
asmlinkage void sparc_breakpoint(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
siginfo_t info;
if (test_thread_flag(TIF_32BIT)) {
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Entering kernel PC=%lx, nPC=%lx\n", regs->tpc, regs->tnpc);
#endif
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_BRKPT;
info.si_addr = (void __user *)regs->tpc;
info.si_trapno = 0;
force_sig_info(SIGTRAP, &info, current);
#ifdef DEBUG_SPARC_BREAKPOINT
printk ("TRAP: Returning to space: PC=%lx nPC=%lx\n", regs->tpc, regs->tnpc);
#endif
exception_exit(prev_state);
}
extern void check_pending(int signum);
SYSCALL_DEFINE2(getdomainname, char __user *, name, int, len)
{
int nlen, err;
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
nlen = strlen(utsname()->domainname) + 1;
err = -EINVAL;
if (nlen > len)
goto out;
err = -EFAULT;
if (!copy_to_user(name, utsname()->domainname, nlen))
err = 0;
out:
up_read(&uts_sem);
return err;
}
SYSCALL_DEFINE5(utrap_install, utrap_entry_t, type,
utrap_handler_t, new_p, utrap_handler_t, new_d,
utrap_handler_t __user *, old_p,
utrap_handler_t __user *, old_d)
{
if (type < UT_INSTRUCTION_EXCEPTION || type > UT_TRAP_INSTRUCTION_31)
return -EINVAL;
if (new_p == (utrap_handler_t)(long)UTH_NOCHANGE) {
if (old_p) {
if (!current_thread_info()->utraps) {
if (put_user(NULL, old_p))
return -EFAULT;
} else {
if (put_user((utrap_handler_t)(current_thread_info()->utraps[type]), old_p))
return -EFAULT;
}
}
if (old_d) {
if (put_user(NULL, old_d))
return -EFAULT;
}
return 0;
}
if (!current_thread_info()->utraps) {
current_thread_info()->utraps =
kzalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long), GFP_KERNEL);
if (!current_thread_info()->utraps)
return -ENOMEM;
current_thread_info()->utraps[0] = 1;
} else {
if ((utrap_handler_t)current_thread_info()->utraps[type] != new_p &&
current_thread_info()->utraps[0] > 1) {
unsigned long *p = current_thread_info()->utraps;
current_thread_info()->utraps =
kmalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long),
GFP_KERNEL);
if (!current_thread_info()->utraps) {
current_thread_info()->utraps = p;
return -ENOMEM;
}
p[0]--;
current_thread_info()->utraps[0] = 1;
memcpy(current_thread_info()->utraps+1, p+1,
UT_TRAP_INSTRUCTION_31*sizeof(long));
}
}
if (old_p) {
if (put_user((utrap_handler_t)(current_thread_info()->utraps[type]), old_p))
return -EFAULT;
}
if (old_d) {
if (put_user(NULL, old_d))
return -EFAULT;
}
current_thread_info()->utraps[type] = (long)new_p;
return 0;
}
asmlinkage long sparc_memory_ordering(unsigned long model,
struct pt_regs *regs)
{
if (model >= 3)
return -EINVAL;
regs->tstate = (regs->tstate & ~TSTATE_MM) | (model << 14);
return 0;
}
SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act,
struct sigaction __user *, oact, void __user *, restorer,
size_t, sigsetsize)
{
struct k_sigaction new_ka, old_ka;
int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (act) {
new_ka.ka_restorer = restorer;
if (copy_from_user(&new_ka.sa, act, sizeof(*act)))
return -EFAULT;
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
if (copy_to_user(oact, &old_ka.sa, sizeof(*oact)))
return -EFAULT;
}
return ret;
}
sparc64: Make montmul/montsqr/mpmul usable in 32-bit threads. The Montgomery Multiply, Montgomery Square, and Multiple-Precision Multiply instructions work by loading a combination of the floating point and multiple register windows worth of integer registers with the inputs. These values are 64-bit. But for 32-bit userland processes we only save the low 32-bits of each integer register during a register spill. This is because the register window save area is in the user stack and has a fixed layout. Therefore, the only way to use these instruction in 32-bit mode is to perform the following sequence: 1) Load the top-32bits of a choosen integer register with a sentinel, say "-1". This will be in the outer-most register window. The idea is that we're trying to see if the outer-most register window gets spilled, and thus the 64-bit values were truncated. 2) Load all the inputs for the montmul/montsqr/mpmul instruction, down to the inner-most register window. 3) Execute the opcode. 4) Traverse back up to the outer-most register window. 5) Check the sentinel, if it's still "-1" store the results. Otherwise retry the entire sequence. This retry is extremely troublesome. If you're just unlucky and an interrupt or other trap happens, it'll push that outer-most window to the stack and clear the sentinel when we restore it. We could retry forever and never make forward progress if interrupts arrive at a fast enough rate (consider perf events as one example). So we have do limited retries and fallback to software which is extremely non-deterministic. Luckily it's very straightforward to provide a mechanism to let 32-bit applications use a 64-bit stack. Stacks in 64-bit mode are biased by 2047 bytes, which means that the lowest bit is set in the actual %sp register value. So if we see bit zero set in a 32-bit application's stack we treat it like a 64-bit stack. Runtime detection of such a facility is tricky, and cumbersome at best. For example, just trying to use a biased stack and seeing if it works is hard to recover from (the signal handler will need to use an alt stack, plus something along the lines of longjmp). Therefore, we add a system call to report a bitmask of arch specific features like this in a cheap and less hairy way. With help from Andy Polyakov. Signed-off-by: David S. Miller <davem@davemloft.net>
2012-10-27 06:18:37 +08:00
asmlinkage long sys_kern_features(void)
{
return KERN_FEATURE_MIXED_MODE_STACK;
}