mirror of https://gitee.com/openkylin/linux.git
threads-v5.9
-----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCXygcLwAKCRCRxhvAZXjc ohajAP4n5E3BmN0jpIviXT4eNhP62jzxJtxlVXtgGT3D8b1mpQEA5n8NSOlQLoAh yUGsjtwR9xDcHMcrhXD3yN6eYJSK0A8= =tn4R -----END PGP SIGNATURE----- Merge tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux Pull thread updates from Christian Brauner: "This contains the changes to add the missing support for attaching to time namespaces via pidfds. Last cycle setns() was changed to support attaching to multiple namespaces atomically. This requires all namespaces to have a point of no return where they can't fail anymore. Specifically, <namespace-type>_install() is allowed to perform permission checks and install the namespace into the new struct nsset that it has been given but it is not allowed to make visible changes to the affected task. Once <namespace-type>_install() returns, anything that the given namespace type additionally requires to be set up needs to ideally be done in a function that can't fail, or if it fails the failure must be non-fatal. For time namespaces the relevant functions that fell into this category were timens_set_vvar_page() and vdso_join_timens(). The latter could still fail although it didn't need to. This function is only implemented for x86 in current mainline. As discussed on-list (cf. [1]), in order to make setns() support time namespaces when attaching to multiple namespaces at once properly we changed vdso_join_timens() to always succeed. So vdso_join_timens() replaces the mmap_write_lock_killable() with mmap_read_lock(). Please note that arm64 is about to grow vdso support for time namespaces (possibly this merge window). We've synced on this change and arm64 also uses mmap_read_lock(), i.e. makes vdso_join_timens() a function that can't fail. 
Once the changes here and the arm64 changes have landed, vdso_join_timens() should be turned into a void function so it's obvious to callers and implementers on other architectures that the expectation is that it can't fail. We didn't do this right away because it would've introduced unnecessary merge conflicts between the two trees for no major gain. As always, tests included" [1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein * tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: tests: add CLONE_NEWTIME setns tests nsproxy: support CLONE_NEWTIME with setns() timens: add timens_commit() helper timens: make vdso_join_timens() always succeed
This commit is contained in:
commit
0a72761b27
|
@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|||
struct mm_struct *mm = task->mm;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
if (mmap_write_lock_killable(mm))
|
||||
return -EINTR;
|
||||
mmap_read_lock(mm);
|
||||
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
unsigned long size = vma->vm_end - vma->vm_start;
|
||||
|
@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|||
zap_page_range(vma, vma->vm_start, size);
|
||||
}
|
||||
|
||||
mmap_write_unlock(mm);
|
||||
mmap_read_unlock(mm);
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
|
|
|
@ -33,6 +33,7 @@ extern struct time_namespace init_time_ns;
|
|||
#ifdef CONFIG_TIME_NS
|
||||
extern int vdso_join_timens(struct task_struct *task,
|
||||
struct time_namespace *ns);
|
||||
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
|
||||
|
||||
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
|
||||
{
|
||||
|
@ -96,6 +97,11 @@ static inline int vdso_join_timens(struct task_struct *task,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline void timens_commit(struct task_struct *tsk,
|
||||
struct time_namespace *ns)
|
||||
{
|
||||
}
|
||||
|
||||
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
|
||||
{
|
||||
return NULL;
|
||||
|
|
|
@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
|
|||
static int check_setns_flags(unsigned long flags)
|
||||
{
|
||||
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
|
||||
CLONE_NEWCGROUP)))
|
||||
CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
|
||||
CLONE_NEWPID | CLONE_NEWCGROUP)))
|
||||
return -EINVAL;
|
||||
|
||||
#ifndef CONFIG_USER_NS
|
||||
|
@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
|
|||
if (flags & CLONE_NEWNET)
|
||||
return -EINVAL;
|
||||
#endif
|
||||
#ifndef CONFIG_TIME_NS
|
||||
if (flags & CLONE_NEWTIME)
|
||||
return -EINVAL;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TIME_NS
|
||||
if (flags & CLONE_NEWTIME) {
|
||||
ret = validate_ns(nsset, &nsp->time_ns->ns);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
|
||||
out:
|
||||
if (pid_ns)
|
||||
put_pid_ns(pid_ns);
|
||||
|
@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
|
|||
exit_sem(me);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TIME_NS
|
||||
if (flags & CLONE_NEWTIME)
|
||||
timens_commit(me, nsset->nsproxy->time_ns);
|
||||
#endif
|
||||
|
||||
/* transfer ownership */
|
||||
switch_task_namespaces(me, nsset->nsproxy);
|
||||
nsset->nsproxy = NULL;
|
||||
|
|
|
@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns)
|
|||
put_time_ns(to_time_ns(ns));
|
||||
}
|
||||
|
||||
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
|
||||
{
|
||||
timens_set_vvar_page(tsk, ns);
|
||||
vdso_join_timens(tsk, ns);
|
||||
}
|
||||
|
||||
static int timens_install(struct nsset *nsset, struct ns_common *new)
|
||||
{
|
||||
struct nsproxy *nsproxy = nsset->nsproxy;
|
||||
struct time_namespace *ns = to_time_ns(new);
|
||||
int err;
|
||||
|
||||
if (!current_is_single_threaded())
|
||||
return -EUSERS;
|
||||
|
@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
|
|||
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
timens_set_vvar_page(current, ns);
|
||||
|
||||
err = vdso_join_timens(current, ns);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
get_time_ns(ns);
|
||||
put_time_ns(nsproxy->time_ns);
|
||||
nsproxy->time_ns = ns;
|
||||
|
@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
|
|||
{
|
||||
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
|
||||
struct time_namespace *ns = to_time_ns(nsc);
|
||||
int err;
|
||||
|
||||
/* create_new_namespaces() already incremented the ref counter */
|
||||
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
|
||||
return 0;
|
||||
|
||||
timens_set_vvar_page(tsk, ns);
|
||||
|
||||
err = vdso_join_timens(tsk, ns);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
get_time_ns(ns);
|
||||
put_time_ns(nsproxy->time_ns);
|
||||
nsproxy->time_ns = ns;
|
||||
|
||||
timens_commit(tsk, ns);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,10 @@
|
|||
#define P_PIDFD 3
|
||||
#endif
|
||||
|
||||
#ifndef CLONE_NEWTIME
|
||||
#define CLONE_NEWTIME 0x00000080
|
||||
#endif
|
||||
|
||||
#ifndef CLONE_PIDFD
|
||||
#define CLONE_PIDFD 0x00001000
|
||||
#endif
|
||||
|
|
|
@ -32,6 +32,7 @@ enum {
|
|||
PIDFD_NS_NET,
|
||||
PIDFD_NS_CGROUP,
|
||||
PIDFD_NS_PIDCLD,
|
||||
PIDFD_NS_TIME,
|
||||
PIDFD_NS_MAX
|
||||
};
|
||||
|
||||
|
@ -47,6 +48,7 @@ const struct ns_info {
|
|||
[PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
|
||||
[PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
|
||||
[PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
|
||||
[PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, },
|
||||
};
|
||||
|
||||
FIXTURE(current_nsset)
|
||||
|
@ -83,9 +85,49 @@ pid_t create_child(int *pidfd, unsigned flags)
|
|||
return sys_clone3(&args, sizeof(struct clone_args));
|
||||
}
|
||||
|
||||
static bool switch_timens(void)
|
||||
{
|
||||
int fd, ret;
|
||||
|
||||
if (unshare(CLONE_NEWTIME))
|
||||
return false;
|
||||
|
||||
fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
|
||||
if (fd < 0)
|
||||
return false;
|
||||
|
||||
ret = setns(fd, CLONE_NEWTIME);
|
||||
close(fd);
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static ssize_t read_nointr(int fd, void *buf, size_t count)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
||||
do {
|
||||
ret = read(fd, buf, count);
|
||||
} while (ret < 0 && errno == EINTR);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t write_nointr(int fd, const void *buf, size_t count)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
||||
do {
|
||||
ret = write(fd, buf, count);
|
||||
} while (ret < 0 && errno == EINTR);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
FIXTURE_SETUP(current_nsset)
|
||||
{
|
||||
int i, proc_fd, ret;
|
||||
int ipc_sockets[2];
|
||||
char c;
|
||||
|
||||
for (i = 0; i < PIDFD_NS_MAX; i++) {
|
||||
self->nsfds[i] = -EBADF;
|
||||
|
@ -130,6 +172,9 @@ FIXTURE_SETUP(current_nsset)
|
|||
TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
|
||||
}
|
||||
|
||||
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
|
||||
EXPECT_EQ(ret, 0);
|
||||
|
||||
/* Create tasks that will be stopped. */
|
||||
self->child_pid1 = create_child(&self->child_pidfd1,
|
||||
CLONE_NEWUSER | CLONE_NEWNS |
|
||||
|
@ -139,10 +184,27 @@ FIXTURE_SETUP(current_nsset)
|
|||
EXPECT_GE(self->child_pid1, 0);
|
||||
|
||||
if (self->child_pid1 == 0) {
|
||||
close(ipc_sockets[0]);
|
||||
|
||||
if (!switch_timens())
|
||||
_exit(EXIT_FAILURE);
|
||||
|
||||
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
|
||||
_exit(EXIT_FAILURE);
|
||||
|
||||
close(ipc_sockets[1]);
|
||||
|
||||
pause();
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
close(ipc_sockets[1]);
|
||||
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
|
||||
close(ipc_sockets[0]);
|
||||
|
||||
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
|
||||
EXPECT_EQ(ret, 0);
|
||||
|
||||
self->child_pid2 = create_child(&self->child_pidfd2,
|
||||
CLONE_NEWUSER | CLONE_NEWNS |
|
||||
CLONE_NEWCGROUP | CLONE_NEWIPC |
|
||||
|
@ -151,10 +213,24 @@ FIXTURE_SETUP(current_nsset)
|
|||
EXPECT_GE(self->child_pid2, 0);
|
||||
|
||||
if (self->child_pid2 == 0) {
|
||||
close(ipc_sockets[0]);
|
||||
|
||||
if (!switch_timens())
|
||||
_exit(EXIT_FAILURE);
|
||||
|
||||
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
|
||||
_exit(EXIT_FAILURE);
|
||||
|
||||
close(ipc_sockets[1]);
|
||||
|
||||
pause();
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
close(ipc_sockets[1]);
|
||||
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
|
||||
close(ipc_sockets[0]);
|
||||
|
||||
for (i = 0; i < PIDFD_NS_MAX; i++) {
|
||||
char p[100];
|
||||
|
||||
|
|
Loading…
Reference in New Issue