cgroups: stash host's cgroupfs file descriptor

parent a25037c0
......@@ -3473,18 +3473,9 @@ static char *cg_unified_get_current_cgroup(bool relative)
static int cg_unified_init(struct cgroup_ops *ops, bool relative,
bool unprivileged)
{
__do_close int cgroup_root_fd = -EBADF;
__do_free char *base_cgroup = NULL, *controllers_path = NULL;
__do_free_string_list char **delegatable = NULL;
__do_free struct hierarchy *new = NULL;
int ret;
ret = unified_cgroup_hierarchy();
if (ret == -ENOMEDIUM)
return ret_errno(ENOMEDIUM);
if (ret != CGROUP2_SUPER_MAGIC)
return 0;
base_cgroup = cg_unified_get_current_cgroup(relative);
if (!base_cgroup)
......@@ -3492,18 +3483,13 @@ static int cg_unified_init(struct cgroup_ops *ops, bool relative,
if (!relative)
prune_init_scope(base_cgroup);
cgroup_root_fd = openat(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
O_NOCTTY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY);
if (cgroup_root_fd < 0)
return -errno;
/*
* We assume that the cgroup we're currently in has been delegated to
us and we are free to further delegate all of the controllers listed
* in cgroup.controllers further down the hierarchy.
*/
controllers_path = must_make_path_relative(base_cgroup, "cgroup.controllers", NULL);
delegatable = cg_unified_get_controllers(cgroup_root_fd, controllers_path);
delegatable = cg_unified_get_controllers(ops->dfd_mnt_cgroupfs_host, controllers_path);
if (!delegatable)
delegatable = cg_unified_make_empty_controller();
if (!delegatable[0])
......@@ -3538,9 +3524,23 @@ static int cg_unified_init(struct cgroup_ops *ops, bool relative,
static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
{
__do_close int dfd = -EBADF;
bool relative = conf->cgroup_meta.relative;
int ret;
const char *tmp;
bool relative = conf->cgroup_meta.relative;
if (ops->dfd_mnt_cgroupfs_host >= 0)
return ret_errno(EINVAL);
/*
* I don't see the need for allowing symlinks here. If users want to
* have their hierarchy available in different locations I strongly
* suggest bind-mounts.
*/
dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
if (dfd < 0)
return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
tmp = lxc_global_config_value("lxc.cgroup.use");
if (tmp) {
......@@ -3554,14 +3554,23 @@ static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
must_append_string(&ops->cgroup_use, cur);
}
ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
if (ret < 0)
return -1;
/*
* Keep dfd referenced by the cleanup function and actually move the fd
* once we know the initialization succeeded. So if we fail we clean up
* the dfd.
*/
ops->dfd_mnt_cgroupfs_host = dfd;
if (ret == CGROUP2_SUPER_MAGIC)
return 0;
if (unified_cgroup_fd(dfd))
ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
else
ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
if (ret < 0)
return syserrno(ret, "Failed to initialize cgroups");
return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
/* Transfer ownership to cgroup_ops. */
move_fd(dfd);
return 0;
}
__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
......@@ -3588,6 +3597,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
return ret_set_errno(NULL, ENOMEM);
cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
if (__cgroup_init(cgfsng_ops, conf))
return NULL;
......
......@@ -68,6 +68,9 @@ void cgroup_exit(struct cgroup_ops *ops)
if (ops->cgroup2_devices)
bpf_program_free(ops->cgroup2_devices);
if (ops->dfd_mnt_cgroupfs_host >= 0)
close(ops->dfd_mnt_cgroupfs_host);
for (struct hierarchy **it = ops->hierarchies; it && *it; it++) {
for (char **p = (*it)->controllers; p && *p; p++)
free(*p);
......
......@@ -106,6 +106,18 @@ struct cgroup_ops {
/* string constant */
const char *version;
/*
* File descriptor for the host's cgroupfs mount. On
CGROUP_LAYOUT_LEGACY or CGROUP_LAYOUT_HYBRID systems
* @dfd_mnt_cgroupfs_host will be a tmpfs fd and the individual
* controllers will be cgroupfs fds. On CGROUP_LAYOUT_UNIFIED it will
* be a cgroupfs fd itself.
*
* So for CGROUP_LAYOUT_LEGACY or CGROUP_LAYOUT_HYBRID we allow
* mountpoint crossing iff we cross from a tmpfs into a cgroupfs mount.
*/
int dfd_mnt_cgroupfs_host;
/* What controllers is the container supposed to use. */
char **cgroup_use;
char *cgroup_pattern;
......
......@@ -83,22 +83,6 @@ bool test_writeable_v2(char *mountpoint, char *path)
return (access(cgroup_threads_file, W_OK) == 0);
}
/*
 * Probe the filesystem mounted at DEFAULT_CGROUP_MOUNTPOINT.
 *
 * Returns CGROUP2_SUPER_MAGIC when a pure cgroup2 hierarchy is mounted
 * there, 0 when some other filesystem (e.g. a tmpfs holding per-controller
 * legacy mounts) is present, and -ENOMEDIUM when the mountpoint cannot be
 * stat'ed at all.
 */
int unified_cgroup_hierarchy(void)
{
	struct statfs fs;

	/* No mountpoint to inspect means no cgroup support at all. */
	if (statfs(DEFAULT_CGROUP_MOUNTPOINT, &fs) < 0)
		return -ENOMEDIUM;

	return is_fs_type(&fs, CGROUP2_SUPER_MAGIC) ? CGROUP2_SUPER_MAGIC : 0;
}
int unified_cgroup_fd(int fd)
{
......
......@@ -29,8 +29,6 @@ __hidden extern bool test_writeable_v1(char *mountpoint, char *path);
*/
__hidden extern bool test_writeable_v2(char *mountpoint, char *path);
__hidden extern int unified_cgroup_hierarchy(void);
__hidden extern int unified_cgroup_fd(int fd);
static inline bool cgns_supported(void)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment