Merge pull request #3464 from brauner/2020-06-19/clone_into_cgroup

lxc: support CLONE_INTO_CGROUP

Merge pull request #3464 from brauner/2020-06-19/clone_into_cgroup
d5827bc0 · Stéphane Graber · GitHub · 3a9018bb · f7176c3e · d5827bc0
Unverified commit d5827bc0, authored Jun 29, 2020 by Stéphane Graber; committed by GitHub on Jun 29, 2020
6 changed files
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
 		struct hierarchy *h = ops->hierarchies[i];
 		int ret;
+		if (is_unified_hierarchy(h) && handler->clone_flags & CLONE_INTO_CGROUP)
+			continue;
 		ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
 		if (ret != 0)
 			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);

--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops)
 	return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
 }
+static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
+{
+	if (!ops->unified)
+		return -EBADF;
+	return ops->unified->cgfd_con;
+}
 #endif
--- a/src/lxc/process_utils.c
+++ b/src/lxc/process_utils.c
@@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc);
 * The nice thing about this is that we get fork() behavior. That is
 * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
 */
-__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd)
+__returns_twice pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd)
 {
 #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
@@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
 	pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0);
 	if (pid < 0 && errno == ENOSYS) {
 		SYSTRACE("Falling back to legacy clone");
-		return __lxc_raw_clone(flags, pidfd);
+		return lxc_raw_legacy_clone(flags, pidfd);
 	}
 	return pid;

--- a/src/lxc/process_utils.h
+++ b/src/lxc/process_utils.h
@@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
 *   The child must use lxc_raw_getpid() to retrieve its pid.
 */
 extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
+extern pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd);
 /*
 * lxc_raw_clone_cb() - create a new process

--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1081,8 +1081,7 @@ static int do_start(void *data)
 	/* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
 	 * https://github.com/lxc/lxd/issues/1978.
 	 */
-	if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+	if (handler->ns_unshare_flags & CLONE_NEWNET) {
-	    (CLONE_NEWNET | CLONE_NEWUSER)) {
 		ret = unshare(CLONE_NEWNET);
 		if (ret < 0) {
 			SYSERROR("Failed to unshare CLONE_NEWNET");
@@ -1190,7 +1189,7 @@ static int do_start(void *data)
 	 *
 	 *	8:cpuset:/
 	 */
-	if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+	if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
 		ret = unshare(CLONE_NEWCGROUP);
 		if (ret < 0) {
 			if (errno != EINVAL) {
@@ -1205,7 +1204,7 @@ static int do_start(void *data)
 		}
 	}
-	if (handler->ns_clone_flags & CLONE_NEWTIME) {
+	if (handler->ns_unshare_flags & CLONE_NEWTIME) {
 		ret = unshare(CLONE_NEWTIME);
 		if (ret < 0) {
 			if (errno != EINVAL) {
@@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler)
 	if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
 		return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
+	/* Deal with namespaces that are unshared. */
+	if (handler->ns_clone_flags & CLONE_NEWTIME)
+		handler->ns_unshare_flags |= CLONE_NEWTIME;
+	if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
+		handler->ns_unshare_flags |= CLONE_NEWCGROUP;
+	if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+	    (CLONE_NEWNET | CLONE_NEWUSER))
+		handler->ns_unshare_flags |= CLONE_NEWNET;
+	/* Deal with namespaces that are spawned. */
+	handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
+	handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
 	return 0;
 }
@@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler)
 	}
 	/* Create a process in a new set of namespaces. */
-	handler->ns_on_clone_flags = handler->ns_clone_flags;
-	if (handler->ns_clone_flags & CLONE_NEWUSER) {
-		/* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
-		 * clone a new user namespace first and only later unshare our
-		 * network namespace to ensure that network devices ownership is
-		 * set up correctly.
-		 */
-		handler->ns_on_clone_flags &= ~CLONE_NEWNET;
-	}
-	/* The cgroup namespace gets unshare()ed not clone()ed. */
-	handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
-	/* The time namespace (currently) gets unshare()ed not clone()ed. */
-	handler->ns_on_clone_flags &= ~CLONE_NEWTIME;
 	if (share_ns) {
 		pid_t attacher_pid;
@@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler)
 			SYSERROR("Intermediate process failed");
 			goto out_delete_net;
 		}
+		if (handler->pid < 0) {
+			SYSERROR(LXC_CLONE_ERROR);
+			goto out_delete_net;
+		}
 	} else {
-		handler->pid = lxc_raw_clone_cb(do_start, handler,
+		int cgroup_fd;
-						CLONE_PIDFD | handler->ns_on_clone_flags,
-						&handler->pidfd);
+		struct lxc_clone_args clone_args = {
-	}
+			.flags = handler->clone_flags,
-	if (handler->pid < 0) {
+			.pidfd = ptr_to_u64(&handler->pidfd),
-		SYSERROR(LXC_CLONE_ERROR);
+			.exit_signal = SIGCHLD,
-		goto out_delete_net;
+		};
+		if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+			cgroup_fd = cgroup_unified_fd(cgroup_ops);
+			if (cgroup_fd >= 0) {
+				handler->clone_flags	|= CLONE_INTO_CGROUP;
+				clone_args.flags	|= CLONE_INTO_CGROUP;
+				clone_args.cgroup	= cgroup_fd;
+			}
+		}
+		/* Try to spawn directly into target cgroup. */
+		handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
+		if (handler->pid < 0) {
+			SYSTRACE("Failed to spawn container directly into target cgroup");
+			/* Kernel might simply be too old for CLONE_INTO_CGROUP. */
+			handler->clone_flags		&= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
+			handler->ns_on_clone_flags	&= ~CLONE_NEWCGROUP;
+			handler->ns_unshare_flags	|= CLONE_NEWCGROUP;
+			clone_args.flags		= handler->clone_flags;
+			handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
+		} else if (cgroup_fd >= 0) {
+			TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
+		}
+		/* Kernel might be too old for clone3(). */
+		if (handler->pid < 0) {
+			SYSTRACE("Failed to spawn container via clone3()");
+			handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+		}
+		if (handler->pid < 0) {
+			SYSERROR(LXC_CLONE_ERROR);
+			goto out_delete_net;
+		}
+		if (handler->pid == 0) {
+			(void)do_start(handler);
+			_exit(EXIT_FAILURE);
+		}
 	}
+	if (handler->pidfd < 0)
+		handler->clone_flags &= ~CLONE_PIDFD;
 	TRACE("Cloned child process %d", handler->pid);
 	/* Verify that we can actually make use of pidfds. */
@@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 	}
 	TRACE("Set up cgroup2 device controller limits");
-	if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+	if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
 		/* Now we're ready to preserve the cgroup namespace */
 		ret = lxc_try_preserve_ns(handler->pid, "cgroup");
 		if (ret < 0) {
@@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 	cgroup_ops->payload_finalize(cgroup_ops);
 	TRACE("Finished setting up cgroups");
-	if (handler->ns_clone_flags & CLONE_NEWTIME) {
+	if (handler->ns_unshare_flags & CLONE_NEWTIME) {
 		/* Now we're ready to preserve the time namespace */
 		ret = lxc_try_preserve_ns(handler->pid, "time");
 		if (ret < 0) {

--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -26,20 +26,18 @@ struct lxc_handler {
 	 *   list the clone flags that were unshare()ed rather than clone()ed
 	 *   because of ordering requirements (e.g. CLONE_NEWNET and
 	 *   CLONE_NEWUSER) or implementation details.
-         *
-	 * @ns_keep_flags;
-	 * - The clone flags for the namespaces that the container will inherit
-	 *   from the parent. They are not recorded in the handler itself but
-	 *   are present in the container's config.
 	 *
-	 * @ns_share_flags;
+	 * @ns_unshare_flags
-	 * - The clone flags for the namespaces that the container will share
+	 * - Flags for namespaces that were unshared, not cloned.
-	 *   with another process.  They are not recorded in the handler itself
+	 *
-	 *   but are present in the container's config.
+	 * @clone_flags
+	 * - ns_on_clone_flags | other flags used to create the container.
 	 */
 	struct /* lxc_ns */ {
-		int ns_clone_flags;
+		unsigned int ns_clone_flags;
-		int ns_on_clone_flags;
+		unsigned int ns_on_clone_flags;
+		unsigned int ns_unshare_flags;
+		unsigned int clone_flags;
 	};
 	/* File descriptor to pin the rootfs for privileged containers. */