cgroups: refactor cgroup handling

This replaces the constructor implementation of cgroup handling with a simpler, thread-safe on-demand model of cgroup driver initialization. Making the cgroup initialization code run in a constructor means that each time the shared library gets mapped the cgroup parsing code gets run. That's unnecessary overhead. It also feels to me that this is only accidently thread-safe because constructors are only run once. But should threads actually end up manipulating or freeing memory that is file-global to cgfsng.c we'd be screwed. Now, I might be wrong here but the cleaner implementation is to allocate a cgroup driver on demand whenever we need it. Take the chance and rework the cgroup_ops interface to make the functions it wants to have implemented a lot cleaner. Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>

cgroups: refactor cgroup handling
2202afc9 · Christian Brauner · b5ead53a · 2202afc9 · 2202afc9 · 2202afc9
Unverified Commit 2202afc9 authored May 18, 2018 by Christian Brauner
13 changed files
--- a/src/lxc/attach.c
+++ b/src/lxc/attach.c
@@ -1272,10 +1272,17 @@ int lxc_attach(const char *name, const char *lxcpath,

 		/* Attach to cgroup, if requested. */
 		if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
-			if (!cgroup_attach(name, lxcpath, pid))
+			struct cgroup_ops *cgroup_ops;
+
+			cgroup_ops = cgroup_init(NULL);
+			if (!cgroup_ops)
+				goto on_error;
+
+			if (!cgroup_ops->attach(cgroup_ops, name, lxcpath, pid))
 				goto on_error;
-			TRACE("Moved intermediate process %d into container's "
-			      "cgroups", pid);
+
+			cgroup_exit(cgroup_ops);
+			TRACE("Moved intermediate process %d into container's cgroups", pid);
 		}

 		/* Setup /proc limits */

--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -60,138 +60,6 @@

 lxc_log_define(lxc_cgfsng, lxc);

-static struct cgroup_ops cgfsng_ops;
-
-/* A descriptor for a mounted hierarchy
- *
- * @controllers
- * - legacy hierarchy
- *   Either NULL, or a null-terminated list of all the co-mounted controllers.
- * - unified hierarchy
- *   Either NULL, or a null-terminated list of all enabled controllers.
- *
- * @mountpoint
- * - The mountpoint we will use.
- * - legacy hierarchy
- *   It will be either /sys/fs/cgroup/controller or
- *   /sys/fs/cgroup/controllerlist.
- * - unified hierarchy
- *   It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name>
- *   depending on whether this is a hybrid cgroup layout (mix of legacy and
- *   unified hierarchies) or a pure unified cgroup layout.
- *
- * @base_cgroup
- * - The cgroup under which the container cgroup path
- *   is created. This will be either the caller's cgroup (if not root), or
- *   init's cgroup (if root).
- *
- * @fullcgpath
- * - The full path to the containers cgroup.
- *
- * @version
- * - legacy hierarchy
- *   If the hierarchy is a legacy hierarchy this will be set to
- *   CGROUP_SUPER_MAGIC.
- * - unified hierarchy
- *   If the hierarchy is a legacy hierarchy this will be set to
- *   CGROUP2_SUPER_MAGIC.
- */
-struct hierarchy {
-	char **controllers;
-	char *mountpoint;
-	char *base_cgroup;
-	char *fullcgpath;
-	int version;
-};
-
-/* The cgroup data which is attached to the lxc_handler.
- *
- * @cgroup_pattern
- * - A copy of lxc.cgroup.pattern.
- *
- * @container_cgroup
- * - If not null, the cgroup which was created for the container. For each
- *   hierarchy, it is created under the @hierarchy->base_cgroup directory.
- *   Relative to the base_cgroup it is the same for all hierarchies.
- *
- * @name
- * - The name of the container.
- *
- * @cgroup_meta
- * - A copy of the container's cgroup information. This overrides
- *   @cgroup_pattern.
- *
- * @cgroup_layout
- * - What cgroup layout the container is running with.
- *   - CGROUP_LAYOUT_UNKNOWN
- *     The cgroup layout could not be determined. This should be treated as an
- *     error condition.
- *   - CGROUP_LAYOUT_LEGACY
- *     The container is running with all controllers mounted into legacy cgroup
- *     hierarchies.
- *   - CGROUP_LAYOUT_HYBRID
- *     The container is running with at least one controller mounted into a
- *     legacy cgroup hierarchy and a mountpoint for the unified hierarchy.  The
- *     unified hierarchy can be empty (no controllers enabled) or non-empty
- *     (controllers enabled).
- *   - CGROUP_LAYOUT_UNIFIED
- *     The container is running on a pure unified cgroup hierarchy. The unified
- *     hierarchy can be empty (no controllers enabled) or non-empty (controllers
- *     enabled).
- */
-struct cgfsng_handler_data {
-	char *cgroup_pattern;
-	char *container_cgroup; /* cgroup we created for the container */
-	char *name; /* container name */
-	/* per-container cgroup information */
-	struct lxc_cgroup cgroup_meta;
-	cgroup_layout_t cgroup_layout;
-};
-
-/* @hierarchies
- * - A NULL-terminated array of struct hierarchy, one per legacy hierarchy. No
- *   duplicates. First sufficient, writeable mounted hierarchy wins.
- */
-struct hierarchy **hierarchies;
-/* Pointer to the unified hierarchy in the null terminated list @hierarchies.
- * This is merely a convenience for hybrid cgroup layouts to easily retrieve the
- * unified hierarchy without iterating throught @hierarchies.
- */
-struct hierarchy *unified;
-/*
- * @cgroup_layout
- * - What cgroup layout the container is running with.
- *   - CGROUP_LAYOUT_UNKNOWN
- *     The cgroup layout could not be determined. This should be treated as an
- *     error condition.
- *   - CGROUP_LAYOUT_LEGACY
- *     The container is running with all controllers mounted into legacy cgroup
- *     hierarchies.
- *   - CGROUP_LAYOUT_HYBRID
- *     The container is running with at least one controller mounted into a
- *     legacy cgroup hierarchy and a mountpoint for the unified hierarchy.  The
- *     unified hierarchy can be empty (no controllers enabled) or non-empty
- *     (controllers enabled).
- *   - CGROUP_LAYOUT_UNIFIED
- *     The container is running on a pure unified cgroup hierarchy. The unified
- *     hierarchy can be empty (no controllers enabled) or non-empty (controllers
- *     enabled).
- */
-cgroup_layout_t cgroup_layout;
-/* What controllers is the container supposed to use. */
-char *cgroup_use;
-
-/* @lxc_cgfsng_debug
- * - Whether to print debug info to stdout for the cgfsng driver.
- */
-static bool lxc_cgfsng_debug;
-
-#define CGFSNG_DEBUG(format, ...)                                              \
-	do {                                                                   \
-		if (lxc_cgfsng_debug)                                          \
-			printf("cgfsng: " format, ##__VA_ARGS__);              \
-	} while (0)
-
 static void free_string_list(char **clist)
 {
 	int i;
@@ -298,40 +166,28 @@ static void must_append_controller(char **klist, char **nlist, char ***clist,
 	(*clist)[newentry] = copy;
 }

-static void free_handler_data(struct cgfsng_handler_data *d)
-{
-	free(d->cgroup_pattern);
-	free(d->container_cgroup);
-	free(d->name);
-	if (d->cgroup_meta.dir)
-		free(d->cgroup_meta.dir);
-	if (d->cgroup_meta.controllers)
-		free(d->cgroup_meta.controllers);
-	free(d);
-}
-
 /* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
-struct hierarchy *get_hierarchy(const char *c)
+struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *c)
 {
 	int i;

-	if (!hierarchies)
+	if (!ops->hierarchies)
 		return NULL;

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; ops->hierarchies[i]; i++) {
 		if (!c) {
 			/* This is the empty unified hierarchy. */
-			if (hierarchies[i]->controllers &&
-			    !hierarchies[i]->controllers[0])
-				return hierarchies[i];
+			if (ops->hierarchies[i]->controllers &&
+			    !ops->hierarchies[i]->controllers[0])
+				return ops->hierarchies[i];

 			continue;
 		}

-		if (string_in_list(hierarchies[i]->controllers, c))
-			return hierarchies[i];
+		if (string_in_list(ops->hierarchies[i]->controllers, c))
+			return ops->hierarchies[i];
 	}

 	return NULL;
@@ -829,23 +685,23 @@ static bool controller_found(struct hierarchy **hlist, char *entry)
 /* Return true if all of the controllers which we require have been found.  The
 * required list is  freezer and anything in lxc.cgroup.use.
 */
-static bool all_controllers_found(void)
+static bool all_controllers_found(struct cgroup_ops *ops)
 {
 	char *p;
 	char *saveptr = NULL;
-	struct hierarchy **hlist = hierarchies;
+	struct hierarchy **hlist = ops->hierarchies;

 	if (!controller_found(hlist, "freezer")) {
-		CGFSNG_DEBUG("No freezer controller mountpoint found\n");
+		ERROR("No freezer controller mountpoint found");
 		return false;
 	}

-	if (!cgroup_use)
+	if (!ops->cgroup_use)
 		return true;

-	for (; (p = strtok_r(cgroup_use, ",", &saveptr)); cgroup_use = NULL)
+	for (; (p = strtok_r(ops->cgroup_use, ",", &saveptr)); ops->cgroup_use = NULL)
 		if (!controller_found(hlist, p)) {
-			CGFSNG_DEBUG("No %s controller mountpoint found\n", p);
+			ERROR("No %s controller mountpoint found", p);
 			return false;
 		}

@@ -879,14 +735,14 @@ static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
 	 * verify /sys/fs/cgroup/ in this field.
 	 */
 	if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
-		CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
+		ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
 		return NULL;
 	}

 	p += 15;
 	p2 = strchr(p, ' ');
 	if (!p2) {
-		CGFSNG_DEBUG("Corrupt mountinfo\n");
+		ERROR("Corrupt mountinfo");
 		return NULL;
 	}
 	*p2 = '\0';
@@ -944,7 +800,7 @@ static char **cg_unified_get_controllers(const char *file)
 	return aret;
 }

-static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
+static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
 				       char *base_cgroup, int type)
 {
 	struct hierarchy *new;
@@ -957,8 +813,8 @@ static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
 	new->fullcgpath = NULL;
 	new->version = type;

-	newentry = append_null_to_list((void ***)&hierarchies);
-	hierarchies[newentry] = new;
+	newentry = append_null_to_list((void ***)h);
+	(*h)[newentry] = new;
 	return new;
 }

@@ -1137,39 +993,26 @@ static void trim(char *s)
 		s[--len] = '\0';
 }

-static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
-{
-	printf("Cgroup information:\n");
-	printf("  container name: %s\n", d->name ? d->name : "(null)");
-	printf("  lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
-	printf("  lxc.cgroup.pattern: %s\n",
-	       d->cgroup_pattern ? d->cgroup_pattern : "(null)");
-	printf("  lxc.cgroup.dir: %s\n",
-	       d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
-	printf("  cgroup: %s\n",
-	       d->container_cgroup ? d->container_cgroup : "(null)");
-}
-
-static void lxc_cgfsng_print_hierarchies()
+static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
 {
 	int i;
 	struct hierarchy **it;

-	if (!hierarchies) {
-		printf("  No hierarchies found\n");
+	if (!ops->hierarchies) {
+		TRACE("  No hierarchies found");
 		return;
 	}

-	printf("  Hierarchies:\n");
-	for (i = 0, it = hierarchies; it && *it; it++, i++) {
+	TRACE("  Hierarchies:");
+	for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
 		int j;
 		char **cit;

-		printf("  %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
-		printf("      mountpoint:  %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
-		printf("      controllers:\n");
+		TRACE("  %d: base_cgroup: %s", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
+		TRACE("      mountpoint:  %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
+		TRACE("      controllers:");
 		for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
-			printf("      %d: %s\n", j, *cit);
+			TRACE("      %d: %s", j, *cit);
 	}
 }

@@ -1179,491 +1022,155 @@ static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
 	int k;
 	char **it;

-	printf("basecginfo is:\n");
-	printf("%s\n", basecginfo);
+	TRACE("basecginfo is:");
+	TRACE("%s", basecginfo);

 	for (k = 0, it = klist; it && *it; it++, k++)
-		printf("kernel subsystem %d: %s\n", k, *it);
+		TRACE("kernel subsystem %d: %s", k, *it);

 	for (k = 0, it = nlist; it && *it; it++, k++)
-		printf("named subsystem %d: %s\n", k, *it);
+		TRACE("named subsystem %d: %s", k, *it);
 }

-static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
-{
-	lxc_cgfsng_print_handler_data(d);
-	lxc_cgfsng_print_hierarchies();
-}
-
-/* At startup, parse_hierarchies finds all the info we need about cgroup
- * mountpoints and current cgroups, and stores it in @d.
- */
-static bool cg_hybrid_init(void)
+static int recursive_destroy(char *dirname)
 {
 	int ret;
-	char *basecginfo;
-	bool will_escape;
-	FILE *f;
-	size_t len = 0;
-	char *line = NULL;
-	char **klist = NULL, **nlist = NULL;
-
-	/* Root spawned containers escape the current cgroup, so use init's
-	 * cgroups as our base in that case.
-	 */
-	will_escape = (geteuid() == 0);
-	if (will_escape)
-		basecginfo = read_file("/proc/1/cgroup");
-	else
-		basecginfo = read_file("/proc/self/cgroup");
-	if (!basecginfo)
-		return false;
-
-	ret = get_existing_subsystems(&klist, &nlist);
-	if (ret < 0) {
-		CGFSNG_DEBUG("Failed to retrieve available legacy cgroup controllers\n");
-		free(basecginfo);
-		return false;
-	}
-
-	f = fopen("/proc/self/mountinfo", "r");
-	if (!f) {
-		CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
-		free(basecginfo);
-		return false;
-	}
-
-	if (lxc_cgfsng_debug)
-		lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
-
-	while (getline(&line, &len, f) != -1) {
-		int type;
-		bool writeable;
-		struct hierarchy *new;
-		char *base_cgroup = NULL, *mountpoint = NULL;
-		char **controller_list = NULL;
-
-		type = get_cgroup_version(line);
-		if (type == 0)
-			continue;
+	struct dirent *direntp;
+	DIR *dir;
+	int r = 0;

-		if (type == CGROUP2_SUPER_MAGIC && unified)
-			continue;
+	dir = opendir(dirname);
+	if (!dir)
+		return -1;

-		if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
-			if (type == CGROUP2_SUPER_MAGIC)
-				cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-			else if (type == CGROUP_SUPER_MAGIC)
-				cgroup_layout = CGROUP_LAYOUT_LEGACY;
-		} else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
-			if (type == CGROUP_SUPER_MAGIC)
-				cgroup_layout = CGROUP_LAYOUT_HYBRID;
-		} else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
-			if (type == CGROUP2_SUPER_MAGIC)
-				cgroup_layout = CGROUP_LAYOUT_HYBRID;
-		}
+	while ((direntp = readdir(dir))) {
+		char *pathname;
+		struct stat mystat;

-		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
-		if (!controller_list && type == CGROUP_SUPER_MAGIC)
+		if (!strcmp(direntp->d_name, ".") ||
+		    !strcmp(direntp->d_name, ".."))
 			continue;

-		if (type == CGROUP_SUPER_MAGIC)
-			if (controller_list_is_dup(hierarchies, controller_list))
-				goto next;
-
-		mountpoint = cg_hybrid_get_mountpoint(line);
-		if (!mountpoint) {
-			CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
-			goto next;
-		}
+		pathname = must_make_path(dirname, direntp->d_name, NULL);

-		if (type == CGROUP_SUPER_MAGIC)
-			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
-		else
-			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
-		if (!base_cgroup) {
-			CGFSNG_DEBUG("Failed to find current cgroup\n");
+		ret = lstat(pathname, &mystat);
+		if (ret < 0) {
+			if (!r)
+				WARN("Failed to stat \"%s\"", pathname);
+			r = -1;
 			goto next;
 		}

-		trim(base_cgroup);
-		prune_init_scope(base_cgroup);
-		if (type == CGROUP2_SUPER_MAGIC)
-			writeable = test_writeable_v2(mountpoint, base_cgroup);
-		else
-			writeable = test_writeable_v1(mountpoint, base_cgroup);
-		if (!writeable)
+		if (!S_ISDIR(mystat.st_mode))
 			goto next;

-		if (type == CGROUP2_SUPER_MAGIC) {
-			char *cgv2_ctrl_path;
-
-			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
-							"cgroup.controllers",
-							NULL);
-
-			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
-			free(cgv2_ctrl_path);
-			if (!controller_list) {
-				controller_list = cg_unified_make_empty_controller();
-				CGFSNG_DEBUG("No controllers are enabled for "
-					     "delegation in the unified hierarchy\n");
-			}
-		}
-
-		new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
-		if (type == CGROUP2_SUPER_MAGIC && !unified)
-			unified = new;
-
-		continue;
-
+		ret = recursive_destroy(pathname);
+		if (ret < 0)
+			r = -1;
 	next:
-		free_string_list(controller_list);
-		free(mountpoint);
-		free(base_cgroup);
+		free(pathname);
 	}

-	free_string_list(klist);
-	free_string_list(nlist);
-
-	free(basecginfo);
-
-	fclose(f);
-	free(line);
-
-	if (lxc_cgfsng_debug) {
-		printf("Writable cgroup hierarchies:\n");
-		lxc_cgfsng_print_hierarchies();
+	ret = rmdir(dirname);
+	if (ret < 0) {
+		if (!r)
+			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
+		r = -1;
 	}

-	/* verify that all controllers in cgroup.use and all crucial
-	 * controllers are accounted for
-	 */
-	if (!all_controllers_found())
-		return false;
+	ret = closedir(dir);
+	if (ret < 0) {
+		if (!r)
+			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
+		r = -1;
+	}

-	return true;
+	return r;
 }

-static int cg_is_pure_unified(void)
+static int cgroup_rmdir(struct hierarchy **hierarchies,
+			const char *container_cgroup)
 {
+	int i;

-	int ret;
-	struct statfs fs;
+	if (!container_cgroup || !hierarchies)
+		return 0;

-	ret = statfs("/sys/fs/cgroup", &fs);
-	if (ret < 0)
-		return -ENOMEDIUM;
+	for (i = 0; hierarchies[i]; i++) {
+		int ret;
+		struct hierarchy *h = hierarchies[i];

-	if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
-		return CGROUP2_SUPER_MAGIC;
+		if (!h->fullcgpath)
+			continue;
+
+		ret = recursive_destroy(h->fullcgpath);
+		if (ret < 0)
+			WARN("Failed to destroy \"%s\"", h->fullcgpath);
+
+		free(h->fullcgpath);
+		h->fullcgpath = NULL;
+	}

 	return 0;
 }

-/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
-static char *cg_unified_get_current_cgroup(void)
-{
-	char *basecginfo, *base_cgroup;
-	bool will_escape;
-	char *copy = NULL;
+struct generic_userns_exec_data {
+	struct hierarchy **hierarchies;
+	const char *container_cgroup;
+	struct lxc_conf *conf;
+	uid_t origuid; /* target uid in parent namespace */
+	char *path;
+};

-	will_escape = (geteuid() == 0);
-	if (will_escape)
-		basecginfo = read_file("/proc/1/cgroup");
-	else
-		basecginfo = read_file("/proc/self/cgroup");
-	if (!basecginfo)
-		return NULL;
+static int cgroup_rmdir_wrapper(void *data)
+{
+	int ret;
+	struct generic_userns_exec_data *arg = data;
+	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
+	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

-	base_cgroup = strstr(basecginfo, "0::/");
-	if (!base_cgroup)
-		goto cleanup_on_err;
+	ret = setresgid(nsgid, nsgid, nsgid);
+	if (ret < 0) {
+		SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
+			 (int)nsgid, (int)nsgid);
+		return -1;
+	}

-	base_cgroup = base_cgroup + 3;
-	copy = copy_to_eol(base_cgroup);
-	if (!copy)
-		goto cleanup_on_err;
+	ret = setresuid(nsuid, nsuid, nsuid);
+	if (ret < 0) {
+		SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
+			 (int)nsuid, (int)nsuid);
+		return -1;
+	}

-cleanup_on_err:
-	free(basecginfo);
-	if (copy)
-		trim(copy);
+	ret = setgroups(0, NULL);
+	if (ret < 0 && errno != EPERM) {
+		SYSERROR("Failed to setgroups(0, NULL)");
+		return -1;
+	}

-	return copy;
+	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
 }

-static int cg_unified_init(void)
+static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
 {
 	int ret;
-	char *mountpoint, *subtree_path;
-	char **delegatable;
-	char *base_cgroup = NULL;
-
-	ret = cg_is_pure_unified();
-	if (ret == -ENOMEDIUM)
-		return -ENOMEDIUM;
-
-	if (ret != CGROUP2_SUPER_MAGIC)
-		return 0;
-
-	base_cgroup = cg_unified_get_current_cgroup();
-	if (!base_cgroup)
-		return -EINVAL;
-	prune_init_scope(base_cgroup);
-
-	/* We assume that we have already been given controllers to delegate
-	 * further down the hierarchy. If not it is up to the user to delegate
-	 * them to us.
-	 */
-	mountpoint = must_copy_string("/sys/fs/cgroup");
-	subtree_path = must_make_path(mountpoint, base_cgroup,
-				      "cgroup.subtree_control", NULL);
-	delegatable = cg_unified_get_controllers(subtree_path);
-	free(subtree_path);
-	if (!delegatable)
-		delegatable = cg_unified_make_empty_controller();
-	if (!delegatable[0])
-		CGFSNG_DEBUG("No controllers are enabled for delegation\n");
-
-	/* TODO: If the user requested specific controllers via lxc.cgroup.use
-	 * we should verify here. The reason I'm not doing it right is that I'm
-	 * not convinced that lxc.cgroup.use will be the future since it is a
-	 * global property. I much rather have an option that lets you request
-	 * controllers per container.
-	 */
-
-	add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
-	unified = hierarchies[0];
-
-	cgroup_layout = CGROUP_LAYOUT_UNIFIED;
-	return CGROUP2_SUPER_MAGIC;
-}
-
-static bool cg_init(void)
-{
-	int ret;
-	const char *tmp;
-
-	errno = 0;
-	tmp = lxc_global_config_value("lxc.cgroup.use");
-	if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
-		CGFSNG_DEBUG("Failed to retrieve list of cgroups to use\n");
-		return false;
-	}
-	cgroup_use = must_copy_string(tmp);
-
-	ret = cg_unified_init();
-	if (ret < 0)
-		return false;
-
-	if (ret == CGROUP2_SUPER_MAGIC)
-		return true;
-
-	return cg_hybrid_init();
-}
-
-static void *cgfsng_init(struct lxc_handler *handler)
-{
-	const char *cgroup_pattern;
-	struct cgfsng_handler_data *d;
-
-	d = must_alloc(sizeof(*d));
-	memset(d, 0, sizeof(*d));
-
-	/* copy container name */
-	d->name = must_copy_string(handler->name);
-
-	/* copy per-container cgroup information */
-	d->cgroup_meta.dir = NULL;
-	d->cgroup_meta.controllers = NULL;
-	if (handler->conf) {
-		d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
-		d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
-	}
-
-	/* copy system-wide cgroup information */
-	cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
-	if (!cgroup_pattern) {
-		/* lxc.cgroup.pattern is only NULL on error. */
-		ERROR("Failed to retrieve cgroup pattern");
-		goto out_free;
-	}
-	d->cgroup_pattern = must_copy_string(cgroup_pattern);
-
-	d->cgroup_layout = cgroup_layout;
-	if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
-		TRACE("Running with legacy cgroup layout");
-	else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
-		TRACE("Running with hybrid cgroup layout");
-	else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
-		TRACE("Running with unified cgroup layout");
-	else
-		WARN("Running with unknown cgroup layout");
-
-	if (lxc_cgfsng_debug)
-		lxc_cgfsng_print_debuginfo(d);
-
-	return d;
-
-out_free:
-	free_handler_data(d);
-	return NULL;
-}
-
-static int recursive_destroy(char *dirname)
-{
-	int ret;
-	struct dirent *direntp;
-	DIR *dir;
-	int r = 0;
-
-	dir = opendir(dirname);
-	if (!dir)
-		return -1;
-
-	while ((direntp = readdir(dir))) {
-		char *pathname;
-		struct stat mystat;
-
-		if (!strcmp(direntp->d_name, ".") ||
-		    !strcmp(direntp->d_name, ".."))
-			continue;
-
-		pathname = must_make_path(dirname, direntp->d_name, NULL);
-
-		ret = lstat(pathname, &mystat);
-		if (ret < 0) {
-			if (!r)
-				WARN("Failed to stat \"%s\"", pathname);
-			r = -1;
-			goto next;
-		}
-
-		if (!S_ISDIR(mystat.st_mode))
-			goto next;
-
-		ret = recursive_destroy(pathname);
-		if (ret < 0)
-			r = -1;
-	next:
-		free(pathname);
-	}
-
-	ret = rmdir(dirname);
-	if (ret < 0) {
-		if (!r)
-			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
-		r = -1;
-	}
-
-	ret = closedir(dir);
-	if (ret < 0) {
-		if (!r)
-			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
-		r = -1;
-	}
-
-	return r;
-}
-
-static int cgroup_rmdir(char *container_cgroup)
-{
-	int i;
-
-	if (!container_cgroup || !hierarchies)
-		return 0;
-
-	for (i = 0; hierarchies[i]; i++) {
-		int ret;
-		struct hierarchy *h = hierarchies[i];
-
-		if (!h->fullcgpath)
-			continue;
-
-		ret = recursive_destroy(h->fullcgpath);
-		if (ret < 0)
-			WARN("Failed to destroy \"%s\"", h->fullcgpath);
-
-		free(h->fullcgpath);
-		h->fullcgpath = NULL;
-	}
-
-	return 0;
-}
-
-struct generic_userns_exec_data {
-	struct cgfsng_handler_data *d;
-	struct lxc_conf *conf;
-	uid_t origuid; /* target uid in parent namespace */
-	char *path;
-};
-
-static int cgroup_rmdir_wrapper(void *data)
-{
-	int ret;
-	struct generic_userns_exec_data *arg = data;
-	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
-	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
-
-	ret = setresgid(nsgid, nsgid, nsgid);
-	if (ret < 0) {
-		SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
-			 (int)nsgid, (int)nsgid);
-		return -1;
-	}
-
-	ret = setresuid(nsuid, nsuid, nsuid);
-	if (ret < 0) {
-		SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
-			 (int)nsuid, (int)nsuid);
-		return -1;
-	}
-
-	ret = setgroups(0, NULL);
-	if (ret < 0 && errno != EPERM) {
-		SYSERROR("Failed to setgroups(0, NULL)");
-		return -1;
-	}
-
-	return cgroup_rmdir(arg->d->container_cgroup);
-}
-
-static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
-{
-	int ret;
-	struct cgfsng_handler_data *d = hdata;
-	struct generic_userns_exec_data wrap;
-
-	if (!d)
-		return;
+	struct generic_userns_exec_data wrap;

 	wrap.origuid = 0;
-	wrap.d = hdata;
-	wrap.conf = conf;
+	wrap.container_cgroup = ops->container_cgroup;
+	wrap.hierarchies = ops->hierarchies;
+	wrap.conf = handler->conf;

-	if (conf && !lxc_list_empty(&conf->id_map))
-		ret = userns_exec_1(conf, cgroup_rmdir_wrapper, &wrap,
+	if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
+		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
 				    "cgroup_rmdir_wrapper");
 	else
-		ret = cgroup_rmdir(d->container_cgroup);
+		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
 	if (ret < 0) {
 		WARN("Failed to destroy cgroups");
 		return;
 	}
-
-	free_handler_data(d);
-}
-
-struct cgroup_ops *cgfsng_ops_init(void)
-{
-	if (getenv("LXC_DEBUG_CGFSNG"))
-		lxc_cgfsng_debug = true;
-
-	if (!cg_init())
-		return NULL;
-
-	return &cgfsng_ops;
 }

 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
@@ -1769,26 +1276,28 @@ static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
-static inline bool cgfsng_create(void *hdata)
+static inline bool cgfsng_create(struct cgroup_ops *ops,
+				 struct lxc_handler *handler)
 {
 	int i;
 	size_t len;
 	char *container_cgroup, *offset, *tmp;
 	int idx = 0;
-	struct cgfsng_handler_data *d = hdata;
+	struct lxc_conf *conf = handler->conf;
+	const char *join_args[] = {conf->cgroup_meta.dir, handler->name, NULL};

-	if (!d)
+	if (ops->container_cgroup) {
+		WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
 		return false;
+	}

-	if (d->container_cgroup) {
-		WARN("cgfsng_create called a second time");
+	if (!conf)
 		return false;
-	}

-	if (d->cgroup_meta.dir)
-		tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
+	if (conf->cgroup_meta.dir)
+		tmp = lxc_string_join("/", join_args, false);
 	else
-		tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
+		tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
 	if (!tmp) {
 		ERROR("Failed expanding cgroup name pattern");
 		return false;
@@ -1820,20 +1329,20 @@ again:
 		}
 	}

-	for (i = 0; hierarchies[i]; i++) {
-		if (!create_path_for_hierarchy(hierarchies[i], container_cgroup)) {
+	for (i = 0; ops->hierarchies[i]; i++) {
+		if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
 			int j;
-			ERROR("Failed to create cgroup \"%s\"", hierarchies[i]->fullcgpath);
-			free(hierarchies[i]->fullcgpath);
-			hierarchies[i]->fullcgpath = NULL;
+			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->fullcgpath);
+			free(ops->hierarchies[i]->fullcgpath);
+			ops->hierarchies[i]->fullcgpath = NULL;
 			for (j = 0; j < i; j++)
-				remove_path_for_hierarchy(hierarchies[j], container_cgroup);
+				remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup);
 			idx++;
 			goto again;
 		}
 	}

-	d->container_cgroup = container_cgroup;
+	ops->container_cgroup = container_cgroup;

 	return true;

@@ -1843,7 +1352,7 @@ out_free:
 	return false;
 }

-static bool cgfsng_enter(void *hdata, pid_t pid)
+static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
 {
 	int i, len;
 	char pidstr[25];
@@ -1852,11 +1361,11 @@ static bool cgfsng_enter(void *hdata, pid_t pid)
 	if (len < 0 || len >= 25)
 		return false;

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; ops->hierarchies[i]; i++) {
 		int ret;
 		char *fullpath;

-		fullpath = must_make_path(hierarchies[i]->fullcgpath,
+		fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
 					  "cgroup.procs", NULL);
 		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
 		if (ret != 0) {
@@ -1929,9 +1438,9 @@ static int chown_cgroup_wrapper(void *data)

 	destuid = get_ns_uid(arg->origuid);

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; arg->hierarchies[i]; i++) {
 		char *fullpath;
-		char *path = hierarchies[i]->fullcgpath;
+		char *path = arg->hierarchies[i]->fullcgpath;

 		ret = chowmod(path, destuid, nsgid, 0775);
 		if (ret < 0)
@@ -1944,17 +1453,17 @@ static int chown_cgroup_wrapper(void *data)
 		 * files (which systemd in wily insists on doing).
 		 */

-		if (hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
+		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
 			fullpath = must_make_path(path, "tasks", NULL);
 			(void)chowmod(fullpath, destuid, nsgid, 0664);
 			free(fullpath);
 		}

 		fullpath = must_make_path(path, "cgroup.procs", NULL);
-		(void)chowmod(fullpath, destuid, 0, 0664);
+		(void)chowmod(fullpath, destuid, nsgid, 0664);
 		free(fullpath);

-		if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
+		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
 			continue;

 		fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
@@ -1969,20 +1478,16 @@ static int chown_cgroup_wrapper(void *data)
 	return 0;
 }

-static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
+static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
 {
-	struct cgfsng_handler_data *d = hdata;
 	struct generic_userns_exec_data wrap;

-	if (!d)
-		return false;
-
 	if (lxc_list_empty(&conf->id_map))
 		return true;

 	wrap.origuid = geteuid();
 	wrap.path = NULL;
-	wrap.d = d;
+	wrap.hierarchies = ops->hierarchies;
 	wrap.conf = conf;

 	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
@@ -2122,13 +1627,12 @@ static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
 	return __cg_mount_direct(type, h, controllerpath);
 }

-static bool cgfsng_mount(void *hdata, const char *root, int type)
+static bool cgfsng_mount(struct cgroup_ops *ops, struct lxc_handler *handler,
+			 const char *root, int type)
 {
 	int i, ret;
 	char *tmpfspath = NULL;
 	bool has_cgns = false, retval = false, wants_force_mount = false;
-	struct lxc_handler *handler = hdata;
-	struct cgfsng_handler_data *d = handler->cgroup_data;

 	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
 		return true;
@@ -2162,9 +1666,9 @@ static bool cgfsng_mount(void *hdata, const char *root, int type)
 	if (ret < 0)
 		goto on_error;

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; ops->hierarchies[i]; i++) {
 		char *controllerpath, *path2;
-		struct hierarchy *h = hierarchies[i];
+		struct hierarchy *h = ops->hierarchies[i];
 		char *controller = strrchr(h->mountpoint, '/');

 		if (!controller)
@@ -2209,7 +1713,7 @@ static bool cgfsng_mount(void *hdata, const char *root, int type)
 		}

 		path2 = must_make_path(controllerpath, h->base_cgroup,
-				       d->container_cgroup, NULL);
+				       ops->container_cgroup, NULL);
 		ret = mkdir_p(path2, 0755);
 		if (ret < 0) {
 			free(controllerpath);
@@ -2218,7 +1722,7 @@ static bool cgfsng_mount(void *hdata, const char *root, int type)
 		}

 		ret = cg_legacy_mount_controllers(type, h, controllerpath,
-						  path2, d->container_cgroup);
+						  path2, ops->container_cgroup);
 		free(controllerpath);
 		free(path2);
 		if (ret < 0)
@@ -2276,35 +1780,34 @@ static int recursive_count_nrtasks(char *dirname)
 	return count;
 }

-static int cgfsng_nrtasks(void *hdata)
+static int cgfsng_nrtasks(struct cgroup_ops *ops)
 {
 	int count;
 	char *path;
-	struct cgfsng_handler_data *d = hdata;

-	if (!d || !d->container_cgroup || !hierarchies)
+	if (!ops->container_cgroup || !ops->hierarchies)
 		return -1;

-	path = must_make_path(hierarchies[0]->fullcgpath, NULL);
+	path = must_make_path(ops->hierarchies[0]->fullcgpath, NULL);
 	count = recursive_count_nrtasks(path);
 	free(path);
 	return count;
 }

 /* Only root needs to escape to the cgroup of its init. */
-static bool cgfsng_escape()
+static bool cgfsng_escape(const struct cgroup_ops *ops)
 {
 	int i;

 	if (geteuid())
 		return true;

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; ops->hierarchies[i]; i++) {
 		int ret;
 		char *fullpath;

-		fullpath = must_make_path(hierarchies[i]->mountpoint,
-					  hierarchies[i]->base_cgroup,
+		fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
+					  ops->hierarchies[i]->base_cgroup,
 					  "cgroup.procs", NULL);
 		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
 		if (ret != 0) {
@@ -2318,26 +1821,26 @@ static bool cgfsng_escape()
 	return true;
 }

-static int cgfsng_num_hierarchies(void)
+static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
 {
 	int i;

-	for (i = 0; hierarchies[i]; i++)
+	for (i = 0; ops->hierarchies[i]; i++)
 		;

 	return i;
 }

-static bool cgfsng_get_hierarchies(int n, char ***out)
+static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
 {
 	int i;

 	/* sanity check n */
 	for (i = 0; i < n; i++)
-		if (!hierarchies[i])
+		if (!ops->hierarchies[i])
 			return false;

-	*out = hierarchies[i]->controllers;
+	*out = ops->hierarchies[i]->controllers;

 	return true;
 }
@@ -2348,13 +1851,13 @@ static bool cgfsng_get_hierarchies(int n, char ***out)
 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
 * to be adapted.
 */
-static bool cgfsng_unfreeze(void *hdata)
+static bool cgfsng_unfreeze(struct cgroup_ops *ops)
 {
 	int ret;
 	char *fullpath;
 	struct hierarchy *h;

-	h = get_hierarchy("freezer");
+	h = get_hierarchy(ops, "freezer");
 	if (!h)
 		return false;

@@ -2367,14 +1870,15 @@ static bool cgfsng_unfreeze(void *hdata)
 	return true;
 }

-static const char *cgfsng_get_cgroup(void *hdata, const char *controller)
+static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
+				     const char *controller)
 {
 	struct hierarchy *h;

-	h = get_hierarchy(controller);
+	h = get_hierarchy(ops, controller);
 	if (!h) {
-		SYSERROR("Failed to find hierarchy for controller \"%s\"",
-			 controller ? controller : "(null)");
+		WARN("Failed to find hierarchy for controller \"%s\"",
+		     controller ? controller : "(null)");
 		return NULL;
 	}

@@ -2465,7 +1969,8 @@ on_error:
 	return fret;
 }

-static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
+static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
+			  const char *lxcpath, pid_t pid)
 {
 	int i, len, ret;
 	char pidstr[25];
@@ -2474,10 +1979,10 @@ static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
 	if (len < 0 || len >= 25)
 		return false;

-	for (i = 0; hierarchies[i]; i++) {
+	for (i = 0; ops->hierarchies[i]; i++) {
 		char *path;
 		char *fullpath = NULL;
-		struct hierarchy *h = hierarchies[i];
+		struct hierarchy *h = ops->hierarchies[i];

 		if (h->version == CGROUP2_SUPER_MAGIC) {
 			ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
@@ -2511,8 +2016,8 @@ static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
-static int cgfsng_get(const char *filename, char *value, size_t len,
-		      const char *name, const char *lxcpath)
+static int cgfsng_get(struct cgroup_ops *ops, const char *filename, char *value,
+		      size_t len, const char *name, const char *lxcpath)
 {
 	int ret = -1;
 	size_t controller_len;
@@ -2531,7 +2036,7 @@ static int cgfsng_get(const char *filename, char *value, size_t len,
 	if (!path)
 		return -1;

-	h = get_hierarchy(controller);
+	h = get_hierarchy(ops, controller);
 	if (h) {
 		char *fullpath;

@@ -2548,8 +2053,8 @@ static int cgfsng_get(const char *filename, char *value, size_t len,
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
-static int cgfsng_set(const char *filename, const char *value, const char *name,
-		      const char *lxcpath)
+static int cgfsng_set(struct cgroup_ops *ops, const char *filename,
+		      const char *value, const char *name, const char *lxcpath)
 {
 	int ret = -1;
 	size_t controller_len;
@@ -2568,7 +2073,7 @@ static int cgfsng_set(const char *filename, const char *value, const char *name,
 	if (!path)
 		return -1;

-	h = get_hierarchy(controller);
+	h = get_hierarchy(ops, controller);
 	if (h) {
 		char *fullpath;

@@ -2662,8 +2167,8 @@ out:
 /* Called from setup_limits - here we have the container's cgroup_data because
 * we created the cgroups.
 */
-static int cg_legacy_set_data(const char *filename, const char *value,
-			      struct cgfsng_handler_data *d)
+static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
+			      const char *value)
 {
 	size_t len;
 	char *fullpath, *p;
@@ -2687,7 +2192,7 @@ static int cg_legacy_set_data(const char *filename, const char *value,
 		value = converted_value;
 	}

-	h = get_hierarchy(controller);
+	h = get_hierarchy(ops, controller);
 	if (!h) {
 		ERROR("Failed to setup limits for the \"%s\" controller. "
 		      "The controller seems to be unused by \"cgfsng\" cgroup "
@@ -2703,13 +2208,12 @@ static int cg_legacy_set_data(const char *filename, const char *value,
 	return ret;
 }

-static bool __cg_legacy_setup_limits(void *hdata,
+static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
 				     struct lxc_list *cgroup_settings,
 				     bool do_devices)
 {
 	struct lxc_list *iterator, *next, *sorted_cgroup_settings;
 	struct lxc_cgroup *cg;
-	struct cgfsng_handler_data *d = hdata;
 	bool ret = false;

 	if (lxc_list_empty(cgroup_settings))
@@ -2723,7 +2227,7 @@ static bool __cg_legacy_setup_limits(void *hdata,
 		cg = iterator->elem;

 		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
-			if (cg_legacy_set_data(cg->subsystem, cg->value, d)) {
+			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
 				if (do_devices && (errno == EACCES || errno == EPERM)) {
 					WARN("Failed to set \"%s\" to \"%s\"",
 					     cg->subsystem, cg->value);
@@ -2749,11 +2253,11 @@ out:
 	return ret;
 }

-static bool __cg_unified_setup_limits(void *hdata,
+static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
 				      struct lxc_list *cgroup_settings)
 {
 	struct lxc_list *iterator;
-	struct hierarchy *h = unified;
+	struct hierarchy *h = ops->unified;

 	if (lxc_list_empty(cgroup_settings))
 		return true;
@@ -2781,35 +2285,328 @@ static bool __cg_unified_setup_limits(void *hdata,
 	return true;
 }

-static bool cgfsng_setup_limits(void *hdata, struct lxc_conf *conf,
+static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf,
 				bool do_devices)
 {
 	bool bret;

-	bret = __cg_legacy_setup_limits(hdata, &conf->cgroup, do_devices);
+	bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
 	if (!bret)
 		return false;

-	return __cg_unified_setup_limits(hdata, &conf->cgroup2);
-}
-
-static struct cgroup_ops cgfsng_ops = {
-	.init = cgfsng_init,
-	.destroy = cgfsng_destroy,
-	.create = cgfsng_create,
-	.enter = cgfsng_enter,
-	.escape = cgfsng_escape,
-	.num_hierarchies = cgfsng_num_hierarchies,
-	.get_hierarchies = cgfsng_get_hierarchies,
-	.get_cgroup = cgfsng_get_cgroup,
-	.get = cgfsng_get,
-	.set = cgfsng_set,
-	.unfreeze = cgfsng_unfreeze,
-	.setup_limits = cgfsng_setup_limits,
-	.driver = "cgfsng",
-	.version = "1.0.0",
-	.attach = cgfsng_attach,
-	.chown = cgfsng_chown,
-	.mount_cgroup = cgfsng_mount,
-	.nrtasks = cgfsng_nrtasks,
-};
+	return __cg_unified_setup_limits(ops, &conf->cgroup2);
+}
+
+/* At startup, parse_hierarchies finds all the info we need about cgroup
+ * mountpoints and current cgroups, and stores it in @d.
+ */
+static bool cg_hybrid_init(struct cgroup_ops *ops)
+{
+	int ret;
+	char *basecginfo;
+	bool will_escape;
+	FILE *f;
+	size_t len = 0;
+	char *line = NULL;
+	char **klist = NULL, **nlist = NULL;
+
+	/* Root spawned containers escape the current cgroup, so use init's
+	 * cgroups as our base in that case.
+	 */
+	will_escape = (geteuid() == 0);
+	if (will_escape)
+		basecginfo = read_file("/proc/1/cgroup");
+	else
+		basecginfo = read_file("/proc/self/cgroup");
+	if (!basecginfo)
+		return false;
+
+	ret = get_existing_subsystems(&klist, &nlist);
+	if (ret < 0) {
+		ERROR("Failed to retrieve available legacy cgroup controllers");
+		free(basecginfo);
+		return false;
+	}
+
+	f = fopen("/proc/self/mountinfo", "r");
+	if (!f) {
+		ERROR("Failed to open \"/proc/self/mountinfo\"");
+		free(basecginfo);
+		return false;
+	}
+
+	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
+
+	while (getline(&line, &len, f) != -1) {
+		int type;
+		bool writeable;
+		struct hierarchy *new;
+		char *base_cgroup = NULL, *mountpoint = NULL;
+		char **controller_list = NULL;
+
+		type = get_cgroup_version(line);
+		if (type == 0)
+			continue;
+
+		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
+			continue;
+
+		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+			else if (type == CGROUP_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
+		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+			if (type == CGROUP_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		}
+
+		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
+		if (!controller_list && type == CGROUP_SUPER_MAGIC)
+			continue;
+
+		if (type == CGROUP_SUPER_MAGIC)
+			if (controller_list_is_dup(ops->hierarchies, controller_list))
+				goto next;
+
+		mountpoint = cg_hybrid_get_mountpoint(line);
+		if (!mountpoint) {
+			ERROR("Failed parsing mountpoint from \"%s\"", line);
+			goto next;
+		}
+
+		if (type == CGROUP_SUPER_MAGIC)
+			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
+		else
+			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
+		if (!base_cgroup) {
+			ERROR("Failed to find current cgroup");
+			goto next;
+		}
+
+		trim(base_cgroup);
+		prune_init_scope(base_cgroup);
+		if (type == CGROUP2_SUPER_MAGIC)
+			writeable = test_writeable_v2(mountpoint, base_cgroup);
+		else
+			writeable = test_writeable_v1(mountpoint, base_cgroup);
+		if (!writeable)
+			goto next;
+
+		if (type == CGROUP2_SUPER_MAGIC) {
+			char *cgv2_ctrl_path;
+
+			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
+							"cgroup.controllers",
+							NULL);
+
+			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
+			free(cgv2_ctrl_path);
+			if (!controller_list) {
+				controller_list = cg_unified_make_empty_controller();
+				TRACE("No controllers are enabled for "
+				      "delegation in the unified hierarchy");
+			}
+		}
+
+		new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
+		if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
+			ops->unified = new;
+
+		continue;
+
+	next:
+		free_string_list(controller_list);
+		free(mountpoint);
+		free(base_cgroup);
+	}
+
+	free_string_list(klist);
+	free_string_list(nlist);
+
+	free(basecginfo);
+
+	fclose(f);
+	free(line);
+
+	TRACE("Writable cgroup hierarchies:");
+	lxc_cgfsng_print_hierarchies(ops);
+
+	/* verify that all controllers in cgroup.use and all crucial
+	 * controllers are accounted for
+	 */
+	if (!all_controllers_found(ops))
+		return false;
+
+	return true;
+}
+
+static int cg_is_pure_unified(void)
+{
+
+	int ret;
+	struct statfs fs;
+
+	ret = statfs("/sys/fs/cgroup", &fs);
+	if (ret < 0)
+		return -ENOMEDIUM;
+
+	if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
+		return CGROUP2_SUPER_MAGIC;
+
+	return 0;
+}
+
+/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
+static char *cg_unified_get_current_cgroup(void)
+{
+	char *basecginfo, *base_cgroup;
+	bool will_escape;
+	char *copy = NULL;
+
+	will_escape = (geteuid() == 0);
+	if (will_escape)
+		basecginfo = read_file("/proc/1/cgroup");
+	else
+		basecginfo = read_file("/proc/self/cgroup");
+	if (!basecginfo)
+		return NULL;
+
+	base_cgroup = strstr(basecginfo, "0::/");
+	if (!base_cgroup)
+		goto cleanup_on_err;
+
+	base_cgroup = base_cgroup + 3;
+	copy = copy_to_eol(base_cgroup);
+	if (!copy)
+		goto cleanup_on_err;
+
+cleanup_on_err:
+	free(basecginfo);
+	if (copy)
+		trim(copy);
+
+	return copy;
+}
+
+static int cg_unified_init(struct cgroup_ops *ops)
+{
+	int ret;
+	char *mountpoint, *subtree_path;
+	char **delegatable;
+	char *base_cgroup = NULL;
+
+	ret = cg_is_pure_unified();
+	if (ret == -ENOMEDIUM)
+		return -ENOMEDIUM;
+
+	if (ret != CGROUP2_SUPER_MAGIC)
+		return 0;
+
+	base_cgroup = cg_unified_get_current_cgroup();
+	if (!base_cgroup)
+		return -EINVAL;
+	prune_init_scope(base_cgroup);
+
+	/* We assume that we have already been given controllers to delegate
+	 * further down the hierarchy. If not it is up to the user to delegate
+	 * them to us.
+	 */
+	mountpoint = must_copy_string("/sys/fs/cgroup");
+	subtree_path = must_make_path(mountpoint, base_cgroup,
+				      "cgroup.subtree_control", NULL);
+	delegatable = cg_unified_get_controllers(subtree_path);
+	free(subtree_path);
+	if (!delegatable)
+		delegatable = cg_unified_make_empty_controller();
+	if (!delegatable[0])
+		TRACE("No controllers are enabled for delegation");
+
+	/* TODO: If the user requested specific controllers via lxc.cgroup.use
+	 * we should verify here. The reason I'm not doing it right is that I'm
+	 * not convinced that lxc.cgroup.use will be the future since it is a
+	 * global property. I much rather have an option that lets you request
+	 * controllers per container.
+	 */
+
+	add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
+
+	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+	return CGROUP2_SUPER_MAGIC;
+}
+
+static bool cg_init(struct cgroup_ops *ops)
+{
+	int ret;
+	const char *tmp;
+
+	tmp = lxc_global_config_value("lxc.cgroup.use");
+	if (tmp)
+		ops->cgroup_use = must_copy_string(tmp);
+
+	ret = cg_unified_init(ops);
+	if (ret < 0)
+		return false;
+
+	if (ret == CGROUP2_SUPER_MAGIC)
+		return true;
+
+	return cg_hybrid_init(ops);
+}
+
+static bool cgfsng_data_init(struct cgroup_ops *ops)
+{
+	const char *cgroup_pattern;
+
+	/* copy system-wide cgroup information */
+	cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
+	if (!cgroup_pattern) {
+		/* lxc.cgroup.pattern is only NULL on error. */
+		ERROR("Failed to retrieve cgroup pattern");
+		return false;
+	}
+	ops->cgroup_pattern = must_copy_string(cgroup_pattern);
+
+	return true;
+}
+
+struct cgroup_ops *cgfsng_ops_init(void)
+{
+	struct cgroup_ops *cgfsng_ops;
+
+	cgfsng_ops = malloc(sizeof(struct cgroup_ops));
+	if (!cgfsng_ops)
+		return NULL;
+
+	memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
+	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
+
+	if (!cg_init(cgfsng_ops)) {
+		free(cgfsng_ops);
+		return NULL;
+	}
+
+	cgfsng_ops->data_init = cgfsng_data_init;
+	cgfsng_ops->destroy = cgfsng_destroy;
+	cgfsng_ops->create = cgfsng_create;
+	cgfsng_ops->enter = cgfsng_enter;
+	cgfsng_ops->escape = cgfsng_escape;
+	cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
+	cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
+	cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
+	cgfsng_ops->get = cgfsng_get;
+	cgfsng_ops->set = cgfsng_set;
+	cgfsng_ops->unfreeze = cgfsng_unfreeze;
+	cgfsng_ops->setup_limits = cgfsng_setup_limits;
+	cgfsng_ops->driver = "cgfsng";
+	cgfsng_ops->version = "1.0.0";
+	cgfsng_ops->attach = cgfsng_attach;
+	cgfsng_ops->chown = cgfsng_chown;
+	cgfsng_ops->mount = cgfsng_mount;
+	cgfsng_ops->nrtasks = cgfsng_nrtasks;
+
+	return cgfsng_ops;
+}
--- a/src/lxc/cgroups/cgroup.c
+++ b/src/lxc/cgroups/cgroup.c
@@ -32,180 +32,61 @@

 lxc_log_define(lxc_cgroup, lxc);

-static struct cgroup_ops *ops = NULL;
-
 extern struct cgroup_ops *cgfsng_ops_init(void);

-__attribute__((constructor)) void cgroup_ops_init(void)
+struct cgroup_ops *cgroup_init(struct lxc_handler *handler)
 {
-	if (ops) {
-		INFO("Running with %s in version %s", ops->driver, ops->version);
-		return;
-	}
-
-	DEBUG("cgroup_init");
-	ops = cgfsng_ops_init();
-	if (ops)
-		INFO("Initialized cgroup driver %s", ops->driver);
-}
+	struct cgroup_ops *cgroup_ops;

-bool cgroup_init(struct lxc_handler *handler)
-{
-	if (handler->cgroup_data) {
-		ERROR("cgroup_init called on already initialized handler");
-		return true;
+	cgroup_ops = cgfsng_ops_init();
+	if (!cgroup_ops) {
+		ERROR("Failed to initialize cgroup driver");
+		return NULL;
 	}

-	if (ops) {
-		INFO("cgroup driver %s initing for %s", ops->driver, handler->name);
-		handler->cgroup_data = ops->init(handler);
-	}
+	if (!cgroup_ops->data_init(cgroup_ops))
+		return NULL;

-	return handler->cgroup_data != NULL;
-}
+	TRACE("Initialized cgroup driver %s", cgroup_ops->driver);

-void cgroup_destroy(struct lxc_handler *handler)
-{
-	if (ops) {
-		ops->destroy(handler->cgroup_data, handler->conf);
-		handler->cgroup_data = NULL;
-	}
-}
+	if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_LEGACY)
+		TRACE("Running with legacy cgroup layout");
+	else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_HYBRID)
+		TRACE("Running with hybrid cgroup layout");
+	else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
+		TRACE("Running with unified cgroup layout");
+	else
+		WARN("Running with unknown cgroup layout");

-/* Create the container cgroups for all requested controllers. */
-bool cgroup_create(struct lxc_handler *handler)
-{
-	if (ops)
-		return ops->create(handler->cgroup_data);
-
-	return false;
+	return cgroup_ops;
 }

-/* Enter the container init into its new cgroups for all requested controllers. */
-bool cgroup_enter(struct lxc_handler *handler)
+void cgroup_exit(struct cgroup_ops *ops)
 {
-	if (ops)
-		return ops->enter(handler->cgroup_data, handler->pid);
+	struct hierarchy **it;

-	return false;
-}
-
-bool cgroup_create_legacy(struct lxc_handler *handler)
-{
-	if (ops && ops->create_legacy)
-		return ops->create_legacy(handler->cgroup_data, handler->pid);
-
-	return true;
-}
-
-const char *cgroup_get_cgroup(struct lxc_handler *handler,
-			      const char *subsystem)
-{
-	if (ops)
-		return ops->get_cgroup(handler->cgroup_data, subsystem);
-
-	return NULL;
-}
-
-bool cgroup_escape(struct lxc_handler *handler)
-{
-	if (ops)
-		return ops->escape(handler->cgroup_data);
-
-	return false;
-}
-
-int cgroup_num_hierarchies(void)
-{
-	if (!ops)
-		return -1;
-
-	return ops->num_hierarchies();
-}
-
-bool cgroup_get_hierarchies(int n, char ***out)
-{
 	if (!ops)
-		return false;
-
-	return ops->get_hierarchies(n, out);
-}
-
-bool cgroup_unfreeze(struct lxc_handler *handler)
-{
-	if (ops)
-		return ops->unfreeze(handler->cgroup_data);
-
-	return false;
-}
-
-bool cgroup_setup_limits(struct lxc_handler *handler, bool with_devices)
-{
-	if (ops)
-		return ops->setup_limits(handler->cgroup_data,
-					 handler->conf, with_devices);
-
-	return false;
-}
+		return;

-bool cgroup_chown(struct lxc_handler *handler)
-{
-	if (ops && ops->chown)
-		return ops->chown(handler->cgroup_data, handler->conf);
+	free(ops->cgroup_use);
+	free(ops->cgroup_pattern);
+	free(ops->container_cgroup);

-	return true;
-}
+	for (it = ops->hierarchies; it && *it; it++) {
+		char **ctrlr;

-bool cgroup_mount(const char *root, struct lxc_handler *handler, int type)
-{
-	if (ops)
-		return ops->mount_cgroup(handler, root, type);
-
-	return false;
-}
+		for (ctrlr = (*it)->controllers; ctrlr && *ctrlr; ctrlr++)
+			free(*ctrlr);
+		free((*it)->controllers);

-int cgroup_nrtasks(struct lxc_handler *handler)
-{
-	if (ops) {
-		if (ops->nrtasks)
-			return ops->nrtasks(handler->cgroup_data);
-		else
-			WARN("cgroup driver \"%s\" doesn't implement nrtasks", ops->driver);
+		free((*it)->mountpoint);
+		free((*it)->base_cgroup);
+		free((*it)->fullcgpath);
+		free(*it);
 	}
+	free(ops->hierarchies);

-	return -1;
-}
-
-bool cgroup_attach(const char *name, const char *lxcpath, pid_t pid)
-{
-	if (ops)
-		return ops->attach(name, lxcpath, pid);
-
-	return false;
-}
-
-int lxc_cgroup_set(const char *filename, const char *value, const char *name,
-		   const char *lxcpath)
-{
-	if (ops)
-		return ops->set(filename, value, name, lxcpath);
-
-	return -1;
-}
-
-int lxc_cgroup_get(const char *filename, char *value, size_t len,
-		   const char *name, const char *lxcpath)
-{
-	if (ops)
-		return ops->get(filename, value, len, name, lxcpath);
-
-	return -1;
-}
-
-void cgroup_disconnect(void)
-{
-	if (ops && ops->disconnect)
-		ops->disconnect();
+	return;
 }

 #define INIT_SCOPE "/init.scope"

--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -39,48 +39,114 @@ typedef enum {
        CGROUP_LAYOUT_UNIFIED =  2,
 } cgroup_layout_t;

+/* A descriptor for a mounted hierarchy
+ *
+ * @controllers
+ * - legacy hierarchy
+ *   Either NULL, or a null-terminated list of all the co-mounted controllers.
+ * - unified hierarchy
+ *   Either NULL, or a null-terminated list of all enabled controllers.
+ *
+ * @mountpoint
+ * - The mountpoint we will use.
+ * - legacy hierarchy
+ *   It will be either /sys/fs/cgroup/controller or
+ *   /sys/fs/cgroup/controllerlist.
+ * - unified hierarchy
+ *   It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name>
+ *   depending on whether this is a hybrid cgroup layout (mix of legacy and
+ *   unified hierarchies) or a pure unified cgroup layout.
+ *
+ * @base_cgroup
+ * - The cgroup under which the container cgroup path
+ *   is created. This will be either the caller's cgroup (if not root), or
+ *   init's cgroup (if root).
+ *
+ * @fullcgpath
+ * - The full path to the containers cgroup.
+ *
+ * @version
+ * - legacy hierarchy
+ *   If the hierarchy is a legacy hierarchy this will be set to
+ *   CGROUP_SUPER_MAGIC.
+ * - unified hierarchy
+ *   If the hierarchy is a legacy hierarchy this will be set to
+ *   CGROUP2_SUPER_MAGIC.
+ */
+struct hierarchy {
+	char **controllers;
+	char *mountpoint;
+	char *base_cgroup;
+	char *fullcgpath;
+	int version;
+};
+
 struct cgroup_ops {
+	/* string constant */
 	const char *driver;
+
+	/* string constant */
 	const char *version;

-	void *(*init)(struct lxc_handler *handler);
-	void (*destroy)(void *hdata, struct lxc_conf *conf);
-	bool (*create)(void *hdata);
-	bool (*enter)(void *hdata, pid_t pid);
-	bool (*create_legacy)(void *hdata, pid_t pid);
-	const char *(*get_cgroup)(void *hdata, const char *subsystem);
-	bool (*escape)();
-	int (*num_hierarchies)();
-	bool (*get_hierarchies)(int n, char ***out);
-	int (*set)(const char *filename, const char *value, const char *name, const char *lxcpath);
-	int (*get)(const char *filename, char *value, size_t len, const char *name, const char *lxcpath);
-	bool (*unfreeze)(void *hdata);
-	bool (*setup_limits)(void *hdata, struct lxc_conf *conf, bool with_devices);
-	bool (*chown)(void *hdata, struct lxc_conf *conf);
-	bool (*attach)(const char *name, const char *lxcpath, pid_t pid);
-	bool (*mount_cgroup)(void *hdata, const char *root, int type);
-	int (*nrtasks)(void *hdata);
-	void (*disconnect)(void);
+	/* What controllers is the container supposed to use. */
+	char *cgroup_use;
+	char *cgroup_pattern;
+	char *container_cgroup;
+
+	/* @hierarchies
+	 * - A NULL-terminated array of struct hierarchy, one per legacy
+	 *   hierarchy. No duplicates. First sufficient, writeable mounted
+	 *   hierarchy wins.
+	 */
+	struct hierarchy **hierarchies;
+	struct hierarchy *unified;
+
+	/*
+	 * @cgroup_layout
+	 * - What cgroup layout the container is running with.
+	 *   - CGROUP_LAYOUT_UNKNOWN
+	 *     The cgroup layout could not be determined. This should be treated
+	 *     as an error condition.
+	 *   - CGROUP_LAYOUT_LEGACY
+	 *     The container is running with all controllers mounted into legacy
+	 *     cgroup hierarchies.
+	 *   - CGROUP_LAYOUT_HYBRID
+	 *     The container is running with at least one controller mounted
+	 *     into a legacy cgroup hierarchy and a mountpoint for the unified
+	 *     hierarchy. The unified hierarchy can be empty (no controllers
+	 *     enabled) or non-empty (controllers enabled).
+	 *   - CGROUP_LAYOUT_UNIFIED
+	 *     The container is running on a pure unified cgroup hierarchy. The
+	 *     unified hierarchy can be empty (no controllers enabled) or
+	 *     non-empty (controllers enabled).
+	 */
+	cgroup_layout_t cgroup_layout;
+
+	bool (*data_init)(struct cgroup_ops *ops);
+	void (*destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
+	bool (*create)(struct cgroup_ops *ops, struct lxc_handler *handler);
+	bool (*enter)(struct cgroup_ops *ops, pid_t pid);
+	const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
+	bool (*escape)(const struct cgroup_ops *ops);
+	int (*num_hierarchies)(struct cgroup_ops *ops);
+	bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
+	int (*set)(struct cgroup_ops *ops, const char *filename,
+		   const char *value, const char *name, const char *lxcpath);
+	int (*get)(struct cgroup_ops *ops, const char *filename, char *value,
+		   size_t len, const char *name, const char *lxcpath);
+	bool (*unfreeze)(struct cgroup_ops *ops);
+	bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
+			     bool with_devices);
+	bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
+	bool (*attach)(struct cgroup_ops *ops, const char *name,
+		       const char *lxcpath, pid_t pid);
+	bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
+		      const char *root, int type);
+	int (*nrtasks)(struct cgroup_ops *ops);
 };

-extern bool cgroup_attach(const char *name, const char *lxcpath, pid_t pid);
-extern bool cgroup_mount(const char *root, struct lxc_handler *handler, int type);
-extern void cgroup_destroy(struct lxc_handler *handler);
-extern bool cgroup_init(struct lxc_handler *handler);
-extern bool cgroup_create(struct lxc_handler *handler);
-extern bool cgroup_setup_limits(struct lxc_handler *handler, bool with_devices);
-extern bool cgroup_chown(struct lxc_handler *handler);
-extern bool cgroup_enter(struct lxc_handler *handler);
-extern void cgroup_cleanup(struct lxc_handler *handler);
-extern bool cgroup_create_legacy(struct lxc_handler *handler);
-extern int cgroup_nrtasks(struct lxc_handler *handler);
-extern const char *cgroup_get_cgroup(struct lxc_handler *handler,
-				     const char *subsystem);
-extern bool cgroup_escape();
-extern int cgroup_num_hierarchies();
-extern bool cgroup_get_hierarchies(int i, char ***out);
-extern bool cgroup_unfreeze(struct lxc_handler *handler);
-extern void cgroup_disconnect(void);
+extern struct cgroup_ops *cgroup_init(struct lxc_handler *handler);
+extern void cgroup_exit(struct cgroup_ops *ops);

 extern void prune_init_scope(char *cg);
 extern bool is_crucial_cgroup_subsystem(const char *s);

--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -473,11 +473,12 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 {
 	const char *path;
 	struct lxc_cmd_rsp rsp;
+	struct cgroup_ops *cgroup_ops = handler->cgroup_ops;

 	if (req->datalen > 0)
-		path = cgroup_get_cgroup(handler, req->data);
+		path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
 	else
-		path = cgroup_get_cgroup(handler, NULL);
+		path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
 	if (!path)
 		return -1;

@@ -637,6 +638,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
 {
 	struct lxc_cmd_rsp rsp;
 	int stopsignal = SIGKILL;
+	struct cgroup_ops *cgroup_ops = handler->cgroup_ops;

 	if (handler->conf->stopsignal)
 		stopsignal = handler->conf->stopsignal;
@@ -648,7 +650,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
 		 * lxc_unfreeze() would do another cmd (GET_CGROUP) which would
 		 * deadlock us.
 		 */
-		if (cgroup_unfreeze(handler))
+		if (cgroup_ops->unfreeze(cgroup_ops))
 			return 0;

 		ERROR("Failed to unfreeze container \"%s\"", handler->name);

--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -757,7 +757,10 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
 		if (flags & LXC_AUTO_CGROUP_FORCE)
 			cg_flags |= LXC_AUTO_CGROUP_FORCE;

-		if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
+		if (!handler->cgroup_ops->mount(handler->cgroup_ops,
+						handler,
+						conf->rootfs.path ? conf->rootfs.mount : "",
+						cg_flags)) {
 			SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
 			return -1;
 		}

--- a/src/lxc/criu.c
+++ b/src/lxc/criu.c
@@ -171,7 +171,7 @@ static int cmp_version(const char *v1, const char *v2)
 	return -1;
 }

-static void exec_criu(struct criu_opts *opts)
+static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
 {
 	char **argv, log[PATH_MAX];
 	int static_args = 23, argc = 0, i, ret;
@@ -190,7 +190,7 @@ static void exec_criu(struct criu_opts *opts)
 	 * /actual/ root cgroup so that lxcfs thinks criu has enough rights to
 	 * see all cgroups.
 	 */
-	if (!cgroup_escape()) {
+	if (!cgroup_ops->escape(cgroup_ops)) {
 		ERROR("failed to escape cgroups");
 		return;
 	}
@@ -248,8 +248,8 @@ static void exec_criu(struct criu_opts *opts)
 		return;
 	}

-	if (cgroup_num_hierarchies() > 0)
-		static_args += 2 * cgroup_num_hierarchies();
+	if (cgroup_ops->num_hierarchies(cgroup_ops) > 0)
+		static_args += 2 * cgroup_ops->num_hierarchies(cgroup_ops);

 	if (opts->user->verbose)
 		static_args++;
@@ -306,11 +306,11 @@ static void exec_criu(struct criu_opts *opts)
 	DECLARE_ARG("-o");
 	DECLARE_ARG(log);

-	for (i = 0; i < cgroup_num_hierarchies(); i++) {
+	for (i = 0; i < cgroup_ops->num_hierarchies(cgroup_ops); i++) {
 		char **controllers = NULL, *fullname;
 		char *path, *tmp;

-		if (!cgroup_get_hierarchies(i, &controllers)) {
+		if (!cgroup_ops->get_hierarchies(cgroup_ops, i, &controllers)) {
 			ERROR("failed to get hierarchy %d", i);
 			goto err;
 		}
@@ -328,7 +328,7 @@ static void exec_criu(struct criu_opts *opts)
 		} else {
 			const char *p;

-			p = cgroup_get_cgroup(opts->handler, controllers[0]);
+			p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
 			if (!p) {
 				ERROR("failed to get cgroup path for %s", controllers[0]);
 				goto err;
@@ -937,6 +937,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
 	struct lxc_handler *handler;
 	int status = 0;
 	int pipes[2] = {-1, -1};
+	struct cgroup_ops *cgroup_ops;

 	/* Try to detach from the current controlling tty if it exists.
 	 * Othwerise, lxc_init (via lxc_console) will attach the container's
@@ -958,12 +959,12 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
 	if (lxc_init(c->name, handler) < 0)
 		goto out;

-	if (!cgroup_init(handler)) {
-		ERROR("failed initing cgroups");
+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
 		goto out_fini_handler;
-	}
+	handler->cgroup_ops = cgroup_ops;

-	if (!cgroup_create(handler)) {
+	if (!cgroup_ops->create(cgroup_ops, handler)) {
 		ERROR("failed creating groups");
 		goto out_fini_handler;
 	}
@@ -1052,7 +1053,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
 		os.console_name = c->lxc_conf->console.name;

 		/* exec_criu() returning is an error */
-		exec_criu(&os);
+		exec_criu(cgroup_ops, &os);
 		umount(rootfs->mount);
 		rmdir(rootfs->mount);
 		goto out_fini_handler;
@@ -1253,16 +1254,21 @@ static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *op
 	if (pid == 0) {
 		struct criu_opts os;
 		struct lxc_handler h;
+		struct cgroup_ops *cgroup_ops;

 		close(criuout[0]);

 		lxc_zero_handler(&h);

 		h.name = c->name;
-		if (!cgroup_init(&h)) {
+
+		cgroup_ops = cgroup_init(NULL);
+		if (!cgroup_ops) {
 			ERROR("failed to cgroup_init()");
 			_exit(EXIT_FAILURE);
+			return -1;
 		}
+		h.cgroup_ops = cgroup_ops;

 		os.pipefd = criuout[1];
 		os.action = mode;
@@ -1278,7 +1284,7 @@ static bool do_dump(struct lxc_container *c, char *mode, struct migrate_opts *op
 		}

 		/* exec_criu() returning is an error */
-		exec_criu(&os);
+		exec_criu(cgroup_ops, &os);
 		free(criu_version);
 		_exit(EXIT_FAILURE);
 	} else {

--- a/src/lxc/freezer.c
+++ b/src/lxc/freezer.c
@@ -31,6 +31,7 @@
 #include <sys/types.h>
 #include <sys/param.h>

+#include "cgroup.h"
 #include "commands.h"
 #include "error.h"
 #include "log.h"
@@ -45,8 +46,14 @@ lxc_state_t freezer_state(const char *name, const char *lxcpath)
 {
 	int ret;
 	char v[100];
+	struct cgroup_ops *cgroup_ops;

-	ret = lxc_cgroup_get("freezer.state", v, sizeof(v), name, lxcpath);
+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
+		return -1;
+
+	ret = cgroup_ops->get(cgroup_ops, "freezer.state", v, sizeof(v), name, lxcpath);
+	cgroup_exit(cgroup_ops);
 	if (ret < 0)
 		return -1;

@@ -60,19 +67,26 @@ static int do_freeze_thaw(bool freeze, const char *name, const char *lxcpath)
 {
 	int ret;
 	char v[100];
+	struct cgroup_ops *cgroup_ops;
 	const char *state = freeze ? "FROZEN" : "THAWED";
 	size_t state_len = 6;
 	lxc_state_t new_state = freeze ? FROZEN : THAWED;

-	ret = lxc_cgroup_set("freezer.state", state, name, lxcpath);
+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
+		return -1;
+
+	ret = cgroup_ops->set(cgroup_ops, "freezer.state", state, name, lxcpath);
 	if (ret < 0) {
+		cgroup_exit(cgroup_ops);
 		ERROR("Failed to freeze %s", name);
 		return -1;
 	}

 	for (;;) {
-		ret = lxc_cgroup_get("freezer.state", v, sizeof(v), name, lxcpath);
+		ret = cgroup_ops->get(cgroup_ops, "freezer.state", v, sizeof(v), name, lxcpath);
 		if (ret < 0) {
+			cgroup_exit(cgroup_ops);
 			ERROR("Failed to get freezer state of %s", name);
 			return -1;
 		}
@@ -82,6 +96,7 @@ static int do_freeze_thaw(bool freeze, const char *name, const char *lxcpath)

 		ret = strncmp(v, state, state_len);
 		if (ret == 0) {
+			cgroup_exit(cgroup_ops);
 			lxc_cmd_serve_state_clients(name, lxcpath, new_state);
 			lxc_monitor_send_state(name, new_state, lxcpath);
 			return 0;

--- a/src/lxc/lxc.h
+++ b/src/lxc/lxc.h
@@ -98,29 +98,6 @@ extern int lxc_unfreeze(const char *name, const char *lxcpath);
 extern lxc_state_t lxc_state(const char *name, const char *lxcpath);

 /*
- * Set a specified value for a specified subsystem. The specified
- * subsystem must be fully specified, eg. "cpu.shares"
- * @filename  : the cgroup attribute filename
- * @value     : the value to be set
- * @name      : the name of the container
- * @lxcpath   : lxc config path for container
- * Returns 0 on success, < 0 otherwise
- */
-extern int lxc_cgroup_set(const char *filename, const char *value, const char *name, const char *lxcpath);
-
-/*
- * Get a specified value for a specified subsystem. The specified
- * subsystem must be fully specified, eg. "cpu.shares"
- * @filename  : the cgroup attribute filename
- * @value     : the value to be set
- * @len       : the len of the value variable
- * @name      : the name of the container
- * @lxcpath   : lxc config path for container
- * Returns the number of bytes read, < 0 on error
- */
-extern int lxc_cgroup_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath);
-
-/*
 * Create and return a new lxccontainer struct.
 */
 extern struct lxc_container *lxc_container_new(const char *name, const char *configpath);

--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -3141,6 +3141,7 @@ WRAP_API_1(bool, lxcapi_set_config_path, const char *)
 static bool do_lxcapi_set_cgroup_item(struct lxc_container *c, const char *subsys, const char *value)
 {
 	int ret;
+	struct cgroup_ops *cgroup_ops;

 	if (!c)
 		return false;
@@ -3148,12 +3149,19 @@ static bool do_lxcapi_set_cgroup_item(struct lxc_container *c, const char *subsy
 	if (is_stopped(c))
 		return false;

+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
+		return false;
+
 	if (container_disk_lock(c))
 		return false;

-	ret = lxc_cgroup_set(subsys, value, c->name, c->config_path);
+	ret = cgroup_ops->set(cgroup_ops, subsys, value, c->name, c->config_path);

 	container_disk_unlock(c);
+
+	cgroup_exit(cgroup_ops);
+
 	return ret == 0;
 }

@@ -3162,6 +3170,7 @@ WRAP_API_2(bool, lxcapi_set_cgroup_item, const char *, const char *)
 static int do_lxcapi_get_cgroup_item(struct lxc_container *c, const char *subsys, char *retv, int inlen)
 {
 	int ret;
+	struct cgroup_ops *cgroup_ops;

 	if (!c)
 		return -1;
@@ -3169,12 +3178,20 @@ static int do_lxcapi_get_cgroup_item(struct lxc_container *c, const char *subsys
 	if (is_stopped(c))
 		return -1;

+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
+		return -1;
+
 	if (container_disk_lock(c))
 		return -1;

-	ret = lxc_cgroup_get(subsys, retv, inlen, c->name, c->config_path);
+	ret = cgroup_ops->get(cgroup_ops, subsys, retv, inlen, c->name,
+			      c->config_path);

 	container_disk_unlock(c);
+
+	cgroup_exit(cgroup_ops);
+
 	return ret;
 }


--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -849,6 +849,13 @@ int lxc_init(const char *name, struct lxc_handler *handler)
 	}
 	TRACE("Chowned console");

+	handler->cgroup_ops = cgroup_init(handler);
+	if (!handler->cgroup_ops) {
+		ERROR("Failed to initialize cgroup driver");
+		goto out_restore_sigmask;
+	}
+	TRACE("Initialized cgroup driver");
+
 	INFO("Container \"%s\" is initialized", name);
 	return 0;

@@ -871,6 +878,7 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 	struct lxc_list *cur, *next;
 	char *namespaces[LXC_NS_MAX + 1];
 	size_t namespace_count = 0;
+	struct cgroup_ops *cgroup_ops = handler->cgroup_ops;

 	/* The STOPPING state is there for future cleanup code which can take
 	 * awhile.
@@ -935,7 +943,8 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 	while (namespace_count--)
 		free(namespaces[namespace_count]);

-	cgroup_destroy(handler);
+	cgroup_ops->destroy(cgroup_ops, handler);
+	cgroup_exit(cgroup_ops);

 	if (handler->conf->reboot == 0) {
 		/* For all new state clients simply close the command socket.
@@ -1506,8 +1515,9 @@ static int lxc_spawn(struct lxc_handler *handler)
 	struct lxc_list *id_map;
 	const char *name = handler->name;
 	const char *lxcpath = handler->lxcpath;
-	bool cgroups_connected = false, share_ns = false;
+	bool share_ns = false;
 	struct lxc_conf *conf = handler->conf;
+	struct cgroup_ops *cgroup_ops = handler->cgroup_ops;

 	id_map = &conf->id_map;
 	wants_to_map_ids = !lxc_list_empty(id_map);
@@ -1567,14 +1577,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 		}
 	}

-	if (!cgroup_init(handler)) {
-		ERROR("Failed initializing cgroup support");
-		goto out_delete_net;
-	}
-
-	cgroups_connected = true;
-
-	if (!cgroup_create(handler)) {
+	if (!cgroup_ops->create(cgroup_ops, handler)) {
 		ERROR("Failed creating cgroups");
 		goto out_delete_net;
 	}
@@ -1663,15 +1666,15 @@ static int lxc_spawn(struct lxc_handler *handler)
 	if (ret < 0)
 		goto out_delete_net;

-	if (!cgroup_setup_limits(handler, false)) {
+	if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, false)) {
 		ERROR("Failed to setup cgroup limits for container \"%s\"", name);
 		goto out_delete_net;
 	}

-	if (!cgroup_enter(handler))
+	if (!cgroup_ops->enter(cgroup_ops, handler->pid))
 		goto out_delete_net;

-	if (!cgroup_chown(handler))
+	if (!cgroup_ops->chown(cgroup_ops, handler->conf))
 		goto out_delete_net;

 	/* Now we're ready to preserve the network namespace */
@@ -1736,15 +1739,12 @@ static int lxc_spawn(struct lxc_handler *handler)
 	if (ret < 0)
 		goto out_delete_net;

-	if (!cgroup_setup_limits(handler, true)) {
+	if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
 		ERROR("Failed to setup legacy device cgroup controller limits");
 		goto out_delete_net;
 	}
 	TRACE("Set up legacy device cgroup controller limits");

-	cgroup_disconnect();
-	cgroups_connected = false;
-
 	if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
 		/* Now we're ready to preserve the cgroup namespace */
 		ret = lxc_try_preserve_ns(handler->pid, "cgroup");
@@ -1821,9 +1821,6 @@ static int lxc_spawn(struct lxc_handler *handler)
 	return 0;

 out_delete_net:
-	if (cgroups_connected)
-		cgroup_disconnect();
-
 	if (handler->ns_clone_flags & CLONE_NEWNET)
 		lxc_delete_network(handler);


--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -132,6 +132,8 @@ struct lxc_handler {
 	 * true.
 	 */
 	int exit_status;
+
+	struct cgroup_ops *cgroup_ops;
 };

 struct execute_args {

--- a/src/tests/cgpath.c
+++ b/src/tests/cgpath.c
@@ -53,6 +53,7 @@ static int test_running_container(const char *lxcpath,
 	char *cgrelpath;
 	char  relpath[PATH_MAX+1];
 	char  value[NAME_MAX], value_save[NAME_MAX];
+	struct cgroup_ops *cgroup_ops;

 	sprintf(relpath, "%s/%s", group ? group : "lxc", name);

@@ -75,36 +76,41 @@ static int test_running_container(const char *lxcpath,
 		goto err3;
 	}

+	cgroup_ops = cgroup_init(NULL);
+	if (!cgroup_ops)
+		goto err3;
+
 	/* test get/set value using memory.soft_limit_in_bytes file */
-	ret = lxc_cgroup_get("memory.soft_limit_in_bytes", value, sizeof(value),
-			     c->name, c->config_path);
+	ret = cgroup_ops->get(cgroup_ops, "memory.soft_limit_in_bytes", value,
+			      sizeof(value), c->name, c->config_path);
 	if (ret < 0) {
-		TSTERR("lxc_cgroup_get failed");
+		TSTERR("cgroup_get failed");
 		goto err3;
 	}
 	strcpy(value_save, value);

-	ret = lxc_cgroup_set("memory.soft_limit_in_bytes", "512M", c->name, c->config_path);
+	ret = cgroup_ops->set(cgroup_ops, "memory.soft_limit_in_bytes", "512M",
+			      c->name, c->config_path);
 	if (ret < 0) {
-		TSTERR("lxc_cgroup_set failed %d %d", ret, errno);
+		TSTERR("cgroup_set failed %d %d", ret, errno);
 		goto err3;
 	}
-	ret = lxc_cgroup_get("memory.soft_limit_in_bytes", value, sizeof(value),
-			     c->name, c->config_path);
+	ret = cgroup_ops->get(cgroup_ops, "memory.soft_limit_in_bytes", value,
+			      sizeof(value), c->name, c->config_path);
 	if (ret < 0) {
-		TSTERR("lxc_cgroup_get failed");
+		TSTERR("cgroup_get failed");
 		goto err3;
 	}
 	if (strcmp(value, "536870912\n")) {
-		TSTERR("lxc_cgroup_set_bypath failed to set value >%s<", value);
+		TSTERR("cgroup_set_bypath failed to set value >%s<", value);
 		goto err3;
 	}

 	/* restore original value */
-	ret = lxc_cgroup_set("memory.soft_limit_in_bytes", value_save,
-			     c->name, c->config_path);
+	ret = cgroup_ops->set(cgroup_ops, "memory.soft_limit_in_bytes",
+			      value_save, c->name, c->config_path);
 	if (ret < 0) {
-		TSTERR("lxc_cgroup_set failed");
+		TSTERR("cgroup_set failed");
 		goto err3;
 	}