Unverified Commit d5827bc0 by Stéphane Graber Committed by GitHub

Merge pull request #3464 from brauner/2020-06-19/clone_into_cgroup

lxc: support CLONE_INTO_CGROUP
parents 3a9018bb f7176c3e
...@@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops, ...@@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
struct hierarchy *h = ops->hierarchies[i]; struct hierarchy *h = ops->hierarchies[i];
int ret; int ret;
if (is_unified_hierarchy(h) && handler->clone_flags & CLONE_INTO_CGROUP)
continue;
ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len); ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
if (ret != 0) if (ret != 0)
return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path); return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
......
...@@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops) ...@@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops)
return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED; return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
} }
/*
 * cgroup_unified_fd() - look up the container cgroup fd of the unified hierarchy
 * @ops: cgroup driver state to inspect
 *
 * Returns the cgfd_con member of ops->unified when a unified hierarchy is
 * recorded in @ops, and -EBADF when ops->unified is NULL.
 */
static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
{
	return ops->unified ? ops->unified->cgfd_con : -EBADF;
}
#endif #endif
...@@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc); ...@@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc);
* The nice thing about this is that we get fork() behavior. That is * The nice thing about this is that we get fork() behavior. That is
* lxc_raw_clone() returns 0 in the child and the child pid in the parent. * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
*/ */
__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd) __returns_twice pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd)
{ {
#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
...@@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd) ...@@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0); pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0);
if (pid < 0 && errno == ENOSYS) { if (pid < 0 && errno == ENOSYS) {
SYSTRACE("Falling back to legacy clone"); SYSTRACE("Falling back to legacy clone");
return __lxc_raw_clone(flags, pidfd); return lxc_raw_legacy_clone(flags, pidfd);
} }
return pid; return pid;
......
...@@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd); ...@@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
* The child must use lxc_raw_getpid() to retrieve its pid. * The child must use lxc_raw_getpid() to retrieve its pid.
*/ */
extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd); extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
extern pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd);
/* /*
* lxc_raw_clone_cb() - create a new process * lxc_raw_clone_cb() - create a new process
......
...@@ -1081,8 +1081,7 @@ static int do_start(void *data) ...@@ -1081,8 +1081,7 @@ static int do_start(void *data)
/* Unshare CLONE_NEWNET after CLONE_NEWUSER. See /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
* https://github.com/lxc/lxd/issues/1978. * https://github.com/lxc/lxd/issues/1978.
*/ */
if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) == if (handler->ns_unshare_flags & CLONE_NEWNET) {
(CLONE_NEWNET | CLONE_NEWUSER)) {
ret = unshare(CLONE_NEWNET); ret = unshare(CLONE_NEWNET);
if (ret < 0) { if (ret < 0) {
SYSERROR("Failed to unshare CLONE_NEWNET"); SYSERROR("Failed to unshare CLONE_NEWNET");
...@@ -1190,7 +1189,7 @@ static int do_start(void *data) ...@@ -1190,7 +1189,7 @@ static int do_start(void *data)
* *
* 8:cpuset:/ * 8:cpuset:/
*/ */
if (handler->ns_clone_flags & CLONE_NEWCGROUP) { if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
ret = unshare(CLONE_NEWCGROUP); ret = unshare(CLONE_NEWCGROUP);
if (ret < 0) { if (ret < 0) {
if (errno != EINVAL) { if (errno != EINVAL) {
...@@ -1205,7 +1204,7 @@ static int do_start(void *data) ...@@ -1205,7 +1204,7 @@ static int do_start(void *data)
} }
} }
if (handler->ns_clone_flags & CLONE_NEWTIME) { if (handler->ns_unshare_flags & CLONE_NEWTIME) {
ret = unshare(CLONE_NEWTIME); ret = unshare(CLONE_NEWTIME);
if (ret < 0) { if (ret < 0) {
if (errno != EINVAL) { if (errno != EINVAL) {
...@@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler) ...@@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler)
if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag)) if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets"); return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
/* Deal with namespaces that are unshared. */
if (handler->ns_clone_flags & CLONE_NEWTIME)
handler->ns_unshare_flags |= CLONE_NEWTIME;
if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
handler->ns_unshare_flags |= CLONE_NEWCGROUP;
if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
(CLONE_NEWNET | CLONE_NEWUSER))
handler->ns_unshare_flags |= CLONE_NEWNET;
/* Deal with namespaces that are spawned. */
handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
return 0; return 0;
} }
...@@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler)
} }
/* Create a process in a new set of namespaces. */ /* Create a process in a new set of namespaces. */
handler->ns_on_clone_flags = handler->ns_clone_flags;
if (handler->ns_clone_flags & CLONE_NEWUSER) {
/* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
* clone a new user namespace first and only later unshare our
* network namespace to ensure that network devices ownership is
* set up correctly.
*/
handler->ns_on_clone_flags &= ~CLONE_NEWNET;
}
/* The cgroup namespace gets unshare()ed not clone()ed. */
handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
/* The time namespace (currently) gets unshare()ed not clone()ed. */
handler->ns_on_clone_flags &= ~CLONE_NEWTIME;
if (share_ns) { if (share_ns) {
pid_t attacher_pid; pid_t attacher_pid;
...@@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler)
SYSERROR("Intermediate process failed"); SYSERROR("Intermediate process failed");
goto out_delete_net; goto out_delete_net;
} }
if (handler->pid < 0) {
SYSERROR(LXC_CLONE_ERROR);
goto out_delete_net;
}
} else { } else {
handler->pid = lxc_raw_clone_cb(do_start, handler, int cgroup_fd;
CLONE_PIDFD | handler->ns_on_clone_flags,
&handler->pidfd); struct lxc_clone_args clone_args = {
} .flags = handler->clone_flags,
if (handler->pid < 0) { .pidfd = ptr_to_u64(&handler->pidfd),
SYSERROR(LXC_CLONE_ERROR); .exit_signal = SIGCHLD,
goto out_delete_net; };
if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
cgroup_fd = cgroup_unified_fd(cgroup_ops);
if (cgroup_fd >= 0) {
handler->clone_flags |= CLONE_INTO_CGROUP;
clone_args.flags |= CLONE_INTO_CGROUP;
clone_args.cgroup = cgroup_fd;
}
}
/* Try to spawn directly into target cgroup. */
handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
if (handler->pid < 0) {
SYSTRACE("Failed to spawn container directly into target cgroup");
/* Kernel might simply be too old for CLONE_INTO_CGROUP. */
handler->clone_flags &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
handler->ns_unshare_flags |= CLONE_NEWCGROUP;
clone_args.flags = handler->clone_flags;
handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
} else if (cgroup_fd >= 0) {
TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
}
/* Kernel might be too old for clone3(). */
if (handler->pid < 0) {
SYSTRACE("Failed to spawn container via clone3()");
handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
}
if (handler->pid < 0) {
SYSERROR(LXC_CLONE_ERROR);
goto out_delete_net;
}
if (handler->pid == 0) {
(void)do_start(handler);
_exit(EXIT_FAILURE);
}
} }
if (handler->pidfd < 0)
handler->clone_flags &= ~CLONE_PIDFD;
TRACE("Cloned child process %d", handler->pid); TRACE("Cloned child process %d", handler->pid);
/* Verify that we can actually make use of pidfds. */ /* Verify that we can actually make use of pidfds. */
...@@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler)
} }
TRACE("Set up cgroup2 device controller limits"); TRACE("Set up cgroup2 device controller limits");
if (handler->ns_clone_flags & CLONE_NEWCGROUP) { if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */ /* Now we're ready to preserve the cgroup namespace */
ret = lxc_try_preserve_ns(handler->pid, "cgroup"); ret = lxc_try_preserve_ns(handler->pid, "cgroup");
if (ret < 0) { if (ret < 0) {
...@@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler)
cgroup_ops->payload_finalize(cgroup_ops); cgroup_ops->payload_finalize(cgroup_ops);
TRACE("Finished setting up cgroups"); TRACE("Finished setting up cgroups");
if (handler->ns_clone_flags & CLONE_NEWTIME) { if (handler->ns_unshare_flags & CLONE_NEWTIME) {
/* Now we're ready to preserve the time namespace */ /* Now we're ready to preserve the time namespace */
ret = lxc_try_preserve_ns(handler->pid, "time"); ret = lxc_try_preserve_ns(handler->pid, "time");
if (ret < 0) { if (ret < 0) {
......
...@@ -26,20 +26,18 @@ struct lxc_handler { ...@@ -26,20 +26,18 @@ struct lxc_handler {
* list the clone flags that were unshare()ed rather than clone()ed * list the clone flags that were unshare()ed rather than clone()ed
* because of ordering requirements (e.g. CLONE_NEWNET and * because of ordering requirements (e.g. CLONE_NEWNET and
* CLONE_NEWUSER) or implementation details. * CLONE_NEWUSER) or implementation details.
*
* @ns_keep_flags;
* - The clone flags for the namespaces that the container will inherit
* from the parent. They are not recorded in the handler itself but
* are present in the container's config.
* *
* @ns_share_flags; * @ns_unshare_flags
* - The clone flags for the namespaces that the container will share * - Flags for namespaces that were unshared, not cloned.
* with another process. They are not recorded in the handler itself *
* but are present in the container's config. * @clone_flags
* - ns_on_clone flags | other flags used to create container.
*/ */
struct /* lxc_ns */ { struct /* lxc_ns */ {
int ns_clone_flags; unsigned int ns_clone_flags;
int ns_on_clone_flags; unsigned int ns_on_clone_flags;
unsigned int ns_unshare_flags;
unsigned int clone_flags;
}; };
/* File descriptor to pin the rootfs for privileged containers. */ /* File descriptor to pin the rootfs for privileged containers. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment