From 8fd00b4d7014b00448eb33cf0590815304769798 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 26 Aug 2009 18:41:16 +0200 Subject: [PATCH 01/12] rlimits: security, add task_struct to setrlimit Add task_struct to task_setrlimit of security_operations to be able to set rlimit of task other than current. Signed-off-by: Jiri Slaby Acked-by: Eric Paris Acked-by: James Morris --- include/linux/security.h | 9 ++++++--- kernel/sys.c | 2 +- security/capability.c | 3 ++- security/security.c | 5 +++-- security/selinux/hooks.c | 7 ++++--- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 0c8819170463..1a3eb5ff4357 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1501,7 +1501,8 @@ struct security_operations { int (*task_setnice) (struct task_struct *p, int nice); int (*task_setioprio) (struct task_struct *p, int ioprio); int (*task_getioprio) (struct task_struct *p); - int (*task_setrlimit) (unsigned int resource, struct rlimit *new_rlim); + int (*task_setrlimit) (struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim); int (*task_setscheduler) (struct task_struct *p, int policy, struct sched_param *lp); int (*task_getscheduler) (struct task_struct *p); @@ -1751,7 +1752,8 @@ void security_task_getsecid(struct task_struct *p, u32 *secid); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); -int security_task_setrlimit(unsigned int resource, struct rlimit *new_rlim); +int security_task_setrlimit(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim); int security_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); int security_task_getscheduler(struct task_struct *p); @@ -2313,7 +2315,8 @@ static inline int security_task_getioprio(struct task_struct *p) return 0; } -static inline int security_task_setrlimit(unsigned int resource, +static inline int security_task_setrlimit(struct task_struct *p, + unsigned int resource, struct rlimit *new_rlim) { return 0; diff --git a/kernel/sys.c b/kernel/sys.c index e83ddbbaf89d..1ba4522689d4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1290,7 +1290,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) return -EPERM; - retval = security_task_setrlimit(resource, &new_rlim); + retval = security_task_setrlimit(current, resource, &new_rlim); if (retval) return retval; diff --git a/security/capability.c b/security/capability.c index 8168e3ecd5bf..7e468263f2de 100644 --- a/security/capability.c +++ b/security/capability.c @@ -412,7 +412,8 @@ static int cap_task_getioprio(struct task_struct *p) return 0; } -static int cap_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) +static int cap_task_setrlimit(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim) { return 0; } diff --git a/security/security.c b/security/security.c index 351942a4ca0e..aa510609a955 100644 --- a/security/security.c +++ b/security/security.c @@ -769,9 +769,10 @@ int security_task_getioprio(struct task_struct *p) return security_ops->task_getioprio(p); } -int security_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) +int security_task_setrlimit(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim) { - return security_ops->task_setrlimit(resource, new_rlim); + return security_ops->task_setrlimit(p, resource, new_rlim); } int security_task_setscheduler(struct task_struct *p, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5c9f25ba1c95..e3ce6b4127cc 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3371,16 +3371,17 @@ static int selinux_task_getioprio(struct task_struct *p) return current_has_perm(p, PROCESS__GETSCHED); } -static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) +static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim) { - struct rlimit *old_rlim = current->signal->rlim + resource; + struct rlimit *old_rlim = p->signal->rlim + resource; /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) - return current_has_perm(current, PROCESS__SETRLIMIT); + return current_has_perm(p, PROCESS__SETRLIMIT); return 0; } From 5ab46b345e418747b3a52f0892680c0745c4223c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 28 Aug 2009 14:05:12 +0200 Subject: [PATCH 02/12] rlimits: add task_struct to update_rlimit_cpu Add task_struct as a parameter to update_rlimit_cpu to be able to set rlimit_cpu of different task than current. Signed-off-by: Jiri Slaby Acked-by: James Morris --- include/linux/posix-timers.h | 2 +- kernel/posix-cpu-timers.c | 8 ++++---- kernel/sys.c | 2 +- security/selinux/hooks.c | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4f71bf4e628c..3e23844a6990 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -117,6 +117,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, long clock_nanosleep_restart(struct restart_block *restart_block); -void update_rlimit_cpu(unsigned long rlim_new); +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); #endif diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..0513900995ce 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -16,13 +16,13 @@ * siglock protection since other code may update expiration cache as * well. */ -void update_rlimit_cpu(unsigned long rlim_new) +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); } static int check_clock(const clockid_t which_clock) diff --git a/kernel/sys.c b/kernel/sys.c index 1ba4522689d4..f5183b08adfc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1320,7 +1320,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - update_rlimit_cpu(new_rlim.rlim_cur); + update_rlimit_cpu(current, new_rlim.rlim_cur); out: return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e3ce6b4127cc..afb18a9ebba1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2338,7 +2338,8 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - update_rlimit_cpu(current->signal->rlim[RLIMIT_CPU].rlim_cur); + update_rlimit_cpu(current, + current->signal->rlim[RLIMIT_CPU].rlim_cur); } } From 2fb9d2689a0041b88b25bc3187eada2968e25995 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Sep 2009 19:21:45 +0200 Subject: [PATCH 03/12] rlimits: make sure ->rlim_max never grows in sys_setrlimit Mostly preparation for Jiri's changes, but probably makes sense anyway. sys_setrlimit() checks new_rlim.rlim_max <= old_rlim->rlim_max, but when it takes task_lock() old_rlim->rlim_max can be already lowered. Move this check under task_lock(). Currently this is not important, we can only race with our sub-thread, this means the application is stupid. But when we change the code to allow the update of !current task's limits, it becomes important to make sure ->rlim_max can be lowered "reliably" even if we race with the application doing sys_setrlimit(). Signed-off-by: Oleg Nesterov Signed-off-by: Jiri Slaby --- kernel/sys.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index f5183b08adfc..f2b2d7aa3818 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1283,10 +1283,6 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) return -EFAULT; if (new_rlim.rlim_cur > new_rlim.rlim_max) return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) return -EPERM; @@ -1304,11 +1300,16 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) new_rlim.rlim_cur = 1; } + old_rlim = current->signal->rlim + resource; task_lock(current->group_leader); - *old_rlim = new_rlim; + if (new_rlim.rlim_max > old_rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + else + *old_rlim = new_rlim; task_unlock(current->group_leader); - if (resource != RLIMIT_CPU) + if (retval || resource != RLIMIT_CPU) goto out; /* @@ -1322,7 +1323,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) update_rlimit_cpu(current, new_rlim.rlim_cur); out: - return 0; + return retval; } /* From eb2d55a32b9a91bca0dea299eedb560bafa8b14e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Jun 2010 22:43:32 +0200 Subject: [PATCH 04/12] rlimits: selinux, do rlimits changes under task_lock When doing an exec, selinux updates rlimits in its code of current process depending on current max. Make sure max or cur doesn't change in the meantime by grabbing task_lock which do_prlimit needs for changing limits too. While at it, use rlimit helper for accessing CPU rlimit a line below. To have a volatile access too. Signed-off-by: Jiri Slaby Cc: Oleg Nesterov --- security/selinux/hooks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index afb18a9ebba1..2a8a0a915ff3 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2333,13 +2333,15 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { + /* protect against do_prlimit() */ + task_lock(current); for (i = 0; i < RLIM_NLIMITS; i++) { rlim = current->signal->rlim + i; initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - update_rlimit_cpu(current, - current->signal->rlim[RLIMIT_CPU].rlim_cur); + task_unlock(current); + update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } From 7855c35da7ba16b389d17710401c4a55a3ea2102 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 26 Aug 2009 23:45:34 +0200 Subject: [PATCH 05/12] rlimits: split sys_setrlimit Create do_setrlimit from sys_setrlimit and declare do_setrlimit in the resource header. This is the first phase to have generic do_prlimit which allows to be called from read, write and compat rlimits code. The new do_setrlimit also accepts a task pointer to change the limits of. Currently, it cannot be other than current, but this will change with locking later. Also pass tsk->group_leader to security_task_setrlimit to check whether current is allowed to change rlimits of the process and not its arbitrary thread because it makes more sense given that rlimit are per process and not per-thread. Signed-off-by: Jiri Slaby --- include/linux/resource.h | 2 ++ kernel/sys.c | 40 ++++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/linux/resource.h b/include/linux/resource.h index f1e914eefeab..cf8dc96653ee 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -73,6 +73,8 @@ struct rlimit { struct task_struct; int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +int do_setrlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim); #endif /* __KERNEL__ */ diff --git a/kernel/sys.c b/kernel/sys.c index f2b2d7aa3818..b5b96e30e0d6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1272,42 +1272,41 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +int do_setrlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim) { - struct rlimit new_rlim, *old_rlim; + struct rlimit *old_rlim; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) + if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) + if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; - retval = security_task_setrlimit(current, resource, &new_rlim); + retval = security_task_setrlimit(tsk->group_leader, resource, new_rlim); if (retval) return retval; - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { /* * The caller is asking for an immediate RLIMIT_CPU * expiry. But we use the zero value to mean "it was * never set". So let's cheat and make it one second * instead */ - new_rlim.rlim_cur = 1; + new_rlim->rlim_cur = 1; } - old_rlim = current->signal->rlim + resource; - task_lock(current->group_leader); - if (new_rlim.rlim_max > old_rlim->rlim_max && + old_rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim->rlim_max > old_rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; else - *old_rlim = new_rlim; - task_unlock(current->group_leader); + *old_rlim = *new_rlim; + task_unlock(tsk->group_leader); if (retval || resource != RLIMIT_CPU) goto out; @@ -1318,14 +1317,23 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) + if (new_rlim->rlim_cur == RLIM_INFINITY) goto out; - update_rlimit_cpu(current, new_rlim.rlim_cur); + update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: return retval; } +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_setrlimit(current, resource, &new_rlim); +} + /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After From 1c1e618ddd15f69fd87ccea596769f78c8065504 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 28 Aug 2009 14:08:17 +0200 Subject: [PATCH 06/12] rlimits: allow setrlimit to non-current tasks Add locking to allow setrlimit accept task parameter other than current. Namely, lock tasklist_lock for read and check whether the task structure has sighand non-null. Do all the signal processing under that lock still held. There are some points: 1) security_task_setrlimit is now called with that lock held. This is not new, many security_* functions are called with this lock held already so it doesn't harm (all this security_* stuff does almost the same). 2) task->sighand->siglock (in update_rlimit_cpu) is nested in tasklist_lock. This dependence is already existing. 3) tsk->alloc_lock is nested in tasklist_lock. This is OK too, already existing dependence. Signed-off-by: Jiri Slaby Cc: Oleg Nesterov --- kernel/sys.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index b5b96e30e0d6..9dbcbbcce153 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1272,6 +1272,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif +/* make sure you are allowed to change @tsk limits before calling this */ int do_setrlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim) { @@ -1285,9 +1286,16 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; + goto out; + } + retval = security_task_setrlimit(tsk->group_leader, resource, new_rlim); if (retval) - return retval; + goto out; if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { /* @@ -1322,6 +1330,7 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: + read_unlock(&tasklist_lock); return retval; } From 86f162f4c75ceb6daf43165469eeeca1bc3d4639 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 14 Nov 2009 17:37:04 +0100 Subject: [PATCH 07/12] rlimits: do security check under task_lock Do security_task_setrlimit under task_lock. Other tasks may change limits under our hands while we are checking limits inside the function. From now on, they can't. Note that all the security work is done under a spinlock here now. Security hooks count with that, they are called from interrupt context (like security_task_kill) and with spinlocks already held (e.g. capable->security_capable). Signed-off-by: Jiri Slaby Acked-by: James Morris Cc: Heiko Carstens --- kernel/sys.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 9dbcbbcce153..c762eebdebf7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1277,7 +1277,7 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim; - int retval; + int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; @@ -1293,9 +1293,14 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, goto out; } - retval = security_task_setrlimit(tsk->group_leader, resource, new_rlim); - if (retval) - goto out; + old_rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim->rlim_max > old_rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, resource, + new_rlim); if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { /* @@ -1307,12 +1312,7 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, new_rlim->rlim_cur = 1; } - old_rlim = tsk->signal->rlim + resource; - task_lock(tsk->group_leader); - if (new_rlim->rlim_max > old_rlim->rlim_max && - !capable(CAP_SYS_RESOURCE)) - retval = -EPERM; - else + if (!retval) *old_rlim = *new_rlim; task_unlock(tsk->group_leader); From 6a1d5e2c85d06da35cdfd93f1a27675bfdc3ad8c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 24 Mar 2010 17:06:58 +0100 Subject: [PATCH 08/12] rlimits: add rlimit64 structure Add a platform independent structure for resource limits to use with a new prlimit64 syscall. This structure is the same which uses glibc for 64-bit limits. Also add corresponding infinity which is a 64-bit full of bit-ones. Signed-off-by: Jiri Slaby --- include/linux/resource.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/resource.h b/include/linux/resource.h index cf8dc96653ee..037aa7e6335d 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -43,6 +43,13 @@ struct rlimit { unsigned long rlim_max; }; +#define RLIM64_INFINITY (~0ULL) + +struct rlimit64 { + __u64 rlim_cur; + __u64 rlim_max; +}; + #define PRIO_MIN (-20) #define PRIO_MAX 20 From 5b41535aac0c07135ff6a4c5c2ae115d1c20c0bc Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 24 Mar 2010 16:11:29 +0100 Subject: [PATCH 09/12] rlimits: redo do_setrlimit to more generic do_prlimit It now allows also reading of limits. I.e. all read and writes will later use this function. It takes two parameters, new and old limits which can be both NULL. If new is non-NULL, the value in it is set to rlimits. If old is non-NULL, current rlimits are stored there. If both are non-NULL, old are stored prior to setting the new ones, atomically. (Similar to sigaction.) Signed-off-by: Jiri Slaby --- include/linux/resource.h | 4 +-- kernel/sys.c | 71 +++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/include/linux/resource.h b/include/linux/resource.h index 037aa7e6335d..88d36f9145ba 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -80,8 +80,8 @@ struct rlimit64 { struct task_struct; int getrusage(struct task_struct *p, int who, struct rusage __user *ru); -int do_setrlimit(struct task_struct *tsk, unsigned int resource, - struct rlimit *new_rlim); +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim); #endif /* __KERNEL__ */ diff --git a/kernel/sys.c b/kernel/sys.c index c762eebdebf7..bc7d1be0960e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1273,18 +1273,21 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif /* make sure you are allowed to change @tsk limits before calling this */ -int do_setrlimit(struct task_struct *tsk, unsigned int resource, - struct rlimit *new_rlim) +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) { - struct rlimit *old_rlim; + struct rlimit *rlim; int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (new_rlim->rlim_cur > new_rlim->rlim_max) - return -EINVAL; - if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) - return -EPERM; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; + } /* protect tsk->signal and tsk->sighand from disappearing */ read_lock(&tasklist_lock); @@ -1293,42 +1296,42 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource, goto out; } - old_rlim = tsk->signal->rlim + resource; + rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); - if (new_rlim->rlim_max > old_rlim->rlim_max && - !capable(CAP_SYS_RESOURCE)) - retval = -EPERM; - if (!retval) - retval = security_task_setrlimit(tsk->group_leader, resource, - new_rlim); - - if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim->rlim_cur = 1; + if (new_rlim) { + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; } - - if (!retval) - *old_rlim = *new_rlim; task_unlock(tsk->group_leader); - if (retval || resource != RLIMIT_CPU) - goto out; - /* * RLIMIT_CPU handling. Note that the kernel fails to return an error * code if it rejected the user's attempt to set RLIMIT_CPU. This is a * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim->rlim_cur == RLIM_INFINITY) - goto out; - - update_rlimit_cpu(tsk, new_rlim->rlim_cur); + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); return retval; @@ -1340,7 +1343,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - return do_setrlimit(current, resource, &new_rlim); + return do_prlimit(current, resource, &new_rlim, NULL); } /* From b95183453af2ed14a5c7027e58049c9fd17e92ce Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 4 May 2010 11:28:25 +0200 Subject: [PATCH 10/12] rlimits: switch more rlimit syscalls to do_prlimit After we added more generic do_prlimit, switch sys_getrlimit to that. Also switch compat handling, so we can get rid of ugly __user casts and avoid setting process' address limit to kernel data and back. Signed-off-by: Jiri Slaby --- kernel/compat.c | 17 +++-------------- kernel/sys.c | 17 ++++++++--------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/kernel/compat.c b/kernel/compat.c index 5adab05a3172..e167efce8423 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs (); - - if (resource >= RLIM_NLIMITS) - return -EINVAL; if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || __get_user(r.rlim_cur, &rlim->rlim_cur) || @@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, r.rlim_cur = RLIM_INFINITY; if (r.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; - set_fs(KERNEL_DS); - ret = sys_setrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - return ret; + return do_prlimit(current, resource, &r, NULL); } #ifdef COMPAT_RLIM_OLD_INFINITY @@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit (unsigned int resource, +asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; int ret; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_getrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); + ret = do_prlimit(current, resource, NULL, &r); if (!ret) { if (r.rlim_cur > COMPAT_RLIM_INFINITY) r.rlim_cur = COMPAT_RLIM_INFINITY; diff --git a/kernel/sys.c b/kernel/sys.c index bc7d1be0960e..9da98dd47276 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1236,15 +1236,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT From c022a0acad534fd5f5d5f17280f6d4d135e74e81 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 4 May 2010 18:03:50 +0200 Subject: [PATCH 11/12] rlimits: implement prlimit64 syscall This patch adds the code to support the sys_prlimit64 syscall which modifies-and-returns the rlim values of a selected process atomically. The first parameter, pid, being 0 means current process. Unlike the current implementation, it is a generic interface, architecture indepentent so that we needn't handle compat stuff anymore. In the future, after glibc start to use this we can deprecate sys_setrlimit and sys_getrlimit in favor to clean up the code finally. It also adds a possibility of changing limits of other processes. We check the user's permissions to do that and if it succeeds, the new limits are propagated online. This is good for large scale applications such as SAP or databases where administrators need to change limits time by time (e.g. on crashes increase core size). And it is unacceptable to restart the service. For safety, all rlim users now either use accessors or doesn't need them due to - locking - the fact a process was just forked and nobody else knows about it yet (and nobody can't thus read/write limits) hence it is safe to modify limits now. The limitation is that we currently stay at ulong internal representation. So the rlim64_is_infinity check is used where value is compared against ULONG_MAX on 32-bit which is the maximum value there. And since internally the limits are held in struct rlimit, converters which are used before and after do_prlimit call in sys_prlimit64 are introduced. Signed-off-by: Jiri Slaby --- include/linux/syscalls.h | 4 ++ kernel/sys.c | 94 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7f614ce274a9..a60943be4270 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -35,6 +35,7 @@ struct oldold_utsname; struct old_utsname; struct pollfd; struct rlimit; +struct rlimit64; struct rusage; struct sched_param; struct sel_arg_struct; @@ -644,6 +645,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r #endif asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim); +asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource, + const struct rlimit64 __user *new_rlim, + struct rlimit64 __user *old_rlim); asmlinkage long sys_getrusage(int who, struct rusage __user *ru); asmlinkage long sys_umask(int mask); diff --git a/kernel/sys.c b/kernel/sys.c index 9da98dd47276..e9ad44489828 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1271,6 +1271,39 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif +static inline bool rlim64_is_infinity(__u64 rlim64) +{ +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + /* make sure you are allowed to change @tsk limits before calling this */ int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim) @@ -1336,6 +1369,67 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, return retval; } +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_RESOURCE)) { + return -EPERM; + } + + return 0; +} + +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if (new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); + } + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim; From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 May 2010 20:17:00 +0200 Subject: [PATCH 12/12] unistd: add __NR_prlimit64 syscall numbers Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to asm-x86, both 32 and 64-bit. Signed-off-by: Jiri Slaby Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + include/asm-generic/unistd.h | 4 +++- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa3..a88e31d1836e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_prlimit64 ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a4..35e0cb151b69 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_prlimit64 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81e..570bf5eae564 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_prlimit64 300 +__SYSCALL(__NR_prlimit64, sys_prlimit64) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..eca1d7d23ab5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_prlimit64 diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 6a0b30f78a62..0dfd517e5ec9 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -626,9 +626,11 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_prlimit64 244 +__SYSCALL(__NR_prlimit64, sys_prlimit64) #undef __NR_syscalls -#define __NR_syscalls 244 +#define __NR_syscalls 245 /* * All syscalls below here should go away really,