Unverified Commit 858f6225 by Stéphane Graber Committed by GitHub

Merge pull request #3675 from brauner/2021-02-16/fixes

cgroups: second batch of cgroup fixes
parents 136b349c 060e54d6
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include "memory_utils.h" #include "memory_utils.h"
#include "mount_utils.h" #include "mount_utils.h"
#include "storage/storage.h" #include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h" #include "syscall_wrappers.h"
#include "utils.h" #include "utils.h"
...@@ -312,234 +313,11 @@ static ssize_t get_max_cpus(char *cpulist) ...@@ -312,234 +313,11 @@ static ssize_t get_max_cpus(char *cpulist)
return cpus; return cpus;
} }
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
/*
 * Filter isolated and offline cpus out of the parent cgroup's cpuset.cpus
 * value and, unless the child cgroup is already initialized, write the
 * resulting cpu list to @child_cgroup/cpuset.cpus.
 *
 * @parent_cgroup:  path of the parent cgroup to read cpuset.cpus from
 * @child_cgroup:   path of the child cgroup to initialize
 * @am_initialized: true if cpuset inheritance was already turned on for us
 *
 * Returns true on success, false on error.
 */
static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
					  char *child_cgroup, bool am_initialized)
{
	__do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
		       *offlinecpus = NULL, *posscpus = NULL;
	__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
			   *possmask = NULL;
	int ret;
	ssize_t i;
	ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
	bool flipped_bit = false;

	fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
	posscpus = read_file_at(-EBADF, fpath, PROTECT_OPEN, 0);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		return false;

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		/* A leading digit means the isolated cpu list is non-empty. */
		if (isdigit(isolcpus[0])) {
			/* Get maximum number of cpus found in isolated cpuset. */
			maxisol = get_max_cpus(isolcpus);
			if (maxisol < 0 || maxisol >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxisol)
			maxposs = maxisol;
		maxposs++;
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		/* A leading digit means the offline cpu list is non-empty. */
		if (isdigit(offlinecpus[0])) {
			/* Get maximum number of cpus found in offline cpuset. */
			maxoffline = get_max_cpus(offlinecpus);
			if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxoffline)
			maxposs = maxoffline;
		maxposs++;
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	/* Nothing to filter out; the parent's list can be used verbatim. */
	if ((maxisol == 0) && (maxoffline == 0)) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (maxisol > 0) {
		isolmask = lxc_cpumask(isolcpus, maxposs);
		if (!isolmask)
			return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
	}

	if (maxoffline > 0) {
		offlinemask = lxc_cpumask(offlinecpus, maxposs);
		if (!offlinemask)
			return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
	}

	/* Clear every possible cpu that is also isolated or offline. */
	for (i = 0; i <= maxposs; i++) {
		if ((isolmask && !is_set(i, isolmask)) ||
		    (offlinemask && !is_set(i, offlinemask)) ||
		    !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	/*
	 * BUGFIX: the branches below were swapped — the unmodified parent
	 * list must be reused when nothing was cleared, and the filtered
	 * mask must be serialized back into a list when bits were cleared.
	 */
	if (!flipped_bit) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false,
					       errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
					       child_cgroup);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}
/* Copy contents of parent(@path)/@file to @path/@file */
static bool copy_parent_file(const char *parent_cgroup,
const char *child_cgroup, const char *file)
{
__do_free char *parent_file = NULL, *value = NULL;
int len = 0;
int ret;
parent_file = must_make_path(parent_cgroup, file, NULL);
len = lxc_read_from_file(parent_file, NULL, 0);
if (len <= 0)
return log_error_errno(false, errno, "Failed to determine buffer size");
value = must_realloc(NULL, len + 1);
value[len] = '\0';
ret = lxc_read_from_file(parent_file, value, len);
if (ret != len)
return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);
ret = lxc_write_openat(child_cgroup, file, value, len);
if (ret < 0 && errno != EACCES)
return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
value, child_cgroup, file);
return true;
}
static inline bool is_unified_hierarchy(const struct hierarchy *h) static inline bool is_unified_hierarchy(const struct hierarchy *h)
{ {
return h->version == CGROUP2_SUPER_MAGIC; return h->version == CGROUP2_SUPER_MAGIC;
} }
/*
 * Initialize the cpuset hierarchy in the first directory of @cgroup_leaf and
 * set cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * Returns -1 on error, 0 when we didn't create a cgroup, 1 if we created a
 * cgroup.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	/* Only the legacy (v1) cpuset controller needs this treatment. */
	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	/* Work on a private copy so we can temporarily NUL-split it. */
	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	/*
	 * Isolate the first path component of @cgroup_leaf: skip leading
	 * slashes, then temporarily terminate at the next slash so that
	 * must_make_path() only appends that single component.
	 */
	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	/* Assume we create the cgroup; downgraded to 0 if it already exists. */
	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	/* '1' means cpuset inheritance is already enabled for this cgroup. */
	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* copy parent's settings */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}
/* Given two null-terminated lists of strings, return true if any string is in /* Given two null-terminated lists of strings, return true if any string is in
* both. * both.
*/ */
...@@ -691,26 +469,101 @@ static char **cg_unified_get_controllers(int dfd, const char *file) ...@@ -691,26 +469,101 @@ static char **cg_unified_get_controllers(int dfd, const char *file)
return move_ptr(aret); return move_ptr(aret);
} }
static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint, static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
char *container_base_path, int type) char **controllers)
{
if (!ops->cgroup_use)
return true;
for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
bool found = false;
for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
if (!strequal(*cur_use, *cur_ctrl))
continue;
found = true;
break;
}
if (found)
continue;
return false;
}
return true;
}
static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
char *container_base_path, int type)
{ {
struct hierarchy *new; __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
__do_free struct hierarchy *new = NULL;
__do_free_string_list char **controllers = clist;
int newentry; int newentry;
if (abspath(container_base_path))
return syserrno(-errno, "Container base path must be relative to controller mount");
if (!controllers && type != CGROUP2_SUPER_MAGIC)
return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");
dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
if (dfd_mnt < 0)
return syserrno(-errno, "Failed to open %s", mountpoint);
if (is_empty_string(container_base_path))
dfd_base = dfd_mnt;
else
dfd_base = open_at(dfd_mnt, container_base_path,
PROTECT_OPATH_DIRECTORY,
PROTECT_LOOKUP_BENEATH_XDEV, 0);
if (dfd_base < 0)
return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
if (!controllers) {
/*
* We assume that the cgroup we're currently in has been delegated to
		 * us and we are free to further delegate all of the controllers listed
* in cgroup.controllers further down the hierarchy.
*/
controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
if (!controllers)
controllers = cg_unified_make_empty_controller();
if (!controllers[0])
TRACE("No controllers are enabled for delegation");
}
/* Exclude all controllers that cgroup use does not want. */
if (!cgroup_use_wants_controllers(ops, controllers))
return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers");
new = zalloc(sizeof(*new)); new = zalloc(sizeof(*new));
if (!new) if (!new)
return ret_set_errno(NULL, ENOMEM); return ret_errno(ENOMEM);
new->controllers = clist;
new->mountpoint = mountpoint; new->version = type;
new->container_base_path = container_base_path; new->controllers = move_ptr(controllers);
new->version = type; new->mountpoint = mountpoint;
new->cgfd_con = -EBADF; new->container_base_path = container_base_path;
new->cgfd_limit = -EBADF; new->cgfd_con = -EBADF;
new->cgfd_mon = -EBADF; new->cgfd_limit = -EBADF;
new->cgfd_mon = -EBADF;
newentry = append_null_to_list((void ***)h);
(*h)[newentry] = new; TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
return new; mountpoint, container_base_path);
for (char *const *it = new->controllers; it && *it; it++)
TRACE("The detected hierarchy contains the %s controller", *it);
newentry = append_null_to_list((void ***)&ops->hierarchies);
new->dfd_mnt = move_fd(dfd_mnt);
new->dfd_base = move_fd(dfd_base);
if (type == CGROUP2_SUPER_MAGIC)
ops->unified = new;
(ops->hierarchies)[newentry] = move_ptr(new);
return 0;
} }
/* Get a copy of the mountpoint from @line, which is a line from /* Get a copy of the mountpoint from @line, which is a line from
...@@ -788,38 +641,69 @@ static bool controller_in_clist(char *cgline, char *c) ...@@ -788,38 +641,69 @@ static bool controller_in_clist(char *cgline, char *c)
return false; return false;
} }
/*
 * Strip all trailing newline characters from @s in place and return @s.
 *
 * The previous (len > 1) guard left a string consisting of a single "\n"
 * untrimmed; use (len > 0) so such a string correctly trims to "" — the
 * callers explicitly allow an empty cgroup string.
 */
static inline char *trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 0) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
* @controller. * @controller.
*/ */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller, static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
int type) char *controller, int type)
{ {
char *p = basecginfo; char *base_cgroup = basecginfo;
for (;;) { for (;;) {
bool is_cgv2_base_cgroup = false; bool is_cgv2_base_cgroup = false;
/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */ /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0')) if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
is_cgv2_base_cgroup = true; is_cgv2_base_cgroup = true;
p = strchr(p, ':'); base_cgroup = strchr(base_cgroup, ':');
if (!p) if (!base_cgroup)
return NULL; return NULL;
p++; base_cgroup++;
if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
__do_free char *copy = NULL;
if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) { base_cgroup = strchr(base_cgroup, ':');
p = strchr(p, ':'); if (!base_cgroup)
if (!p)
return NULL; return NULL;
p++; base_cgroup++;
return copy_to_eol(p);
copy = copy_to_eol(base_cgroup);
if (!copy)
return NULL;
trim(copy);
if (!relative) {
base_cgroup = prune_init_scope(copy);
if (!base_cgroup)
return NULL;
} else {
base_cgroup = copy;
}
if (abspath(base_cgroup))
base_cgroup = deabs(base_cgroup);
/* We're allowing base_cgroup to be "". */
return strdup(base_cgroup);
} }
p = strchr(p, '\n'); base_cgroup = strchr(base_cgroup, '\n');
if (!p) if (!base_cgroup)
return NULL; return NULL;
p++; base_cgroup++;
} }
} }
...@@ -877,40 +761,6 @@ static int get_existing_subsystems(char ***klist, char ***nlist) ...@@ -877,40 +761,6 @@ static int get_existing_subsystems(char ***klist, char ***nlist)
return 0; return 0;
} }
/*
 * Strip trailing newlines from @s in place and return @s. The very first
 * character is never removed, even when it is itself a newline.
 */
static char *trim(char *s)
{
	char *end = s + strlen(s);

	while ((end - s) > 1 && end[-1] == '\n')
		*--end = '\0';

	return s;
}
/* Dump every detected hierarchy (base cgroup, mountpoint, controllers) at
 * TRACE level for debugging.
 */
static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
{
	struct hierarchy **it;
	int i = 0;

	if (!ops->hierarchies) {
		TRACE(" No hierarchies found");
		return;
	}

	TRACE(" Hierarchies:");
	for (it = ops->hierarchies; it && *it; it++) {
		struct hierarchy *h = *it;
		char **cit;
		int j = 0;

		TRACE(" %d: base_cgroup: %s", i, h->container_base_path ? h->container_base_path : "(null)");
		TRACE(" mountpoint: %s", h->mountpoint ? h->mountpoint : "(null)");
		TRACE(" controllers:");
		for (cit = h->controllers; cit && *cit; cit++) {
			TRACE(" %d: %s", j, *cit);
			j++;
		}
		i++;
	}
}
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
char **nlist) char **nlist)
{ {
...@@ -1023,118 +873,223 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, ...@@ -1023,118 +873,223 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
} else { } else {
ret = cgroup_tree_remove(ops->hierarchies, ops->container_cgroup); ret = cgroup_tree_remove(ops->hierarchies, ops->container_cgroup);
} }
if (ret < 0) if (ret < 0)
SYSWARN("Failed to destroy cgroups"); SYSWARN("Failed to destroy cgroups");
} }
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
bool am_initialized)
{
__do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
*offlinecpus = NULL, *posscpus = NULL;
__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
*possmask = NULL;
int ret;
ssize_t i;
ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
bool flipped_bit = false;
posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
if (!posscpus)
return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
/* Get maximum number of cpus found in possible cpuset. */
maxposs = get_max_cpus(posscpus);
if (maxposs < 0 || maxposs >= INT_MAX - 1)
return false;
if (file_exists(__ISOL_CPUS)) {
isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
if (!isolcpus)
return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
if (isdigit(isolcpus[0])) {
/* Get maximum number of cpus found in isolated cpuset. */
maxisol = get_max_cpus(isolcpus);
if (maxisol < 0 || maxisol >= INT_MAX - 1)
return false;
}
if (maxposs < maxisol)
maxposs = maxisol;
maxposs++;
} else {
TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
}
if (file_exists(__OFFLINE_CPUS)) {
offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
if (!offlinecpus)
return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
if (isdigit(offlinecpus[0])) {
/* Get maximum number of cpus found in offline cpuset. */
maxoffline = get_max_cpus(offlinecpus);
if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
return false;
}
if (maxposs < maxoffline)
maxposs = maxoffline;
maxposs++;
} else {
TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
}
if ((maxisol == 0) && (maxoffline == 0)) {
cpulist = move_ptr(posscpus);
goto copy_parent;
}
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, possmask = lxc_cpumask(posscpus, maxposs);
struct lxc_handler *handler) if (!possmask)
{ return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
int len;
char pidstr[INTTYPE_TO_STRLEN(pid_t)];
const struct lxc_conf *conf;
if (!ops) { if (maxisol > 0) {
ERROR("Called with uninitialized cgroup operations"); isolmask = lxc_cpumask(isolcpus, maxposs);
return; if (!isolmask)
return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
} }
if (!ops->hierarchies) if (maxoffline > 0) {
return; offlinemask = lxc_cpumask(offlinecpus, maxposs);
if (!offlinemask)
return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
}
if (!handler) { for (i = 0; i <= maxposs; i++) {
ERROR("Called with uninitialized handler"); if ((isolmask && !is_set(i, isolmask)) ||
return; (offlinemask && !is_set(i, offlinemask)) ||
!is_set(i, possmask))
continue;
flipped_bit = true;
clear_bit(i, possmask);
} }
if (!handler->conf) { if (!flipped_bit) {
ERROR("Called with uninitialized conf"); cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
return; TRACE("No isolated or offline cpus present in cpuset");
} else {
cpulist = move_ptr(posscpus);
TRACE("Removed isolated or offline cpus from cpuset");
} }
conf = handler->conf; if (!cpulist)
return log_error_errno(false, errno, "Failed to create cpu list");
len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid); copy_parent:
if (len < 0) if (!am_initialized) {
return; ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
if (ret < 0)
return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
for (int i = 0; ops->hierarchies[i]; i++) { TRACE("Copied cpu settings of parent cgroup");
__do_free char *pivot_path = NULL; }
struct hierarchy *h = ops->hierarchies[i];
size_t offset;
int ret;
if (!h->monitor_full_path) return true;
continue; }
/* Monitor might have died before we entered the cgroup. */ static bool cpuset1_initialize(int dfd_base, int dfd_next)
if (handler->monitor_pid <= 0) { {
WARN("No valid monitor process found while destroying cgroups"); char mems[PATH_MAX];
goto try_lxc_rm_rf; ssize_t bytes;
} char v;
if (conf->cgroup_meta.monitor_pivot_dir) /*
pivot_path = must_make_path(h->mountpoint, h->container_base_path, * Determine whether the base cgroup has cpuset
conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL); * inheritance turned on.
else if (conf->cgroup_meta.monitor_dir) */
pivot_path = must_make_path(h->mountpoint, h->container_base_path, bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
conf->cgroup_meta.monitor_dir, CGROUP_PIVOT, NULL); if (bytes < 0)
else if (conf->cgroup_meta.dir) return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
pivot_path = must_make_path(h->mountpoint, h->container_base_path,
conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
else
pivot_path = must_make_path(h->mountpoint, h->container_base_path,
CGROUP_PIVOT, NULL);
offset = strlen(h->mountpoint) + strlen(h->container_base_path); /*
	 * Initialize cpuset.cpus and remove any isolated
* and offline cpus.
*/
if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
return syserrno(false, "Failed to initialize cpuset.cpus");
if (cg_legacy_handle_cpuset_hierarchy(h, pivot_path + offset)) /* Read cpuset.mems from parent... */
SYSWARN("Failed to initialize cpuset %s/" CGROUP_PIVOT, pivot_path); bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
if (bytes < 0)
return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
ret = mkdir_p(pivot_path, 0755); /* ... and copy to first cgroup in the tree... */
if (ret < 0 && errno != EEXIST) { bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
ERROR("Failed to create %s", pivot_path); if (bytes < 0)
goto try_lxc_rm_rf; return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
}
ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len); /* ... and finally turn on cpuset inheritance. */
if (ret != 0) { bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path); if (bytes < 0)
continue; return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
}
try_lxc_rm_rf: return log_trace(true, "Initialized cpuset in the legacy hierarchy");
ret = lxc_rm_rf(h->monitor_full_path);
if (ret < 0)
WARN("Failed to destroy \"%s\"", h->monitor_full_path);
}
} }
static int mkdir_eexist_on_last(const char *dir, mode_t mode) static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
bool cpuset_v1, bool eexist_ignore)
{ {
const char *tmp = dir; __do_close int dfd_final = -EBADF;
const char *orig = dir; int dfd_cur = dfd_base;
size_t orig_len; int ret = 0;
size_t len;
char *cur;
char buf[PATH_MAX];
orig_len = strlen(dir); if (is_empty_string(path))
do { return ret_errno(-EINVAL);
__do_free char *makeme = NULL;
int ret; len = strlcpy(buf, path, sizeof(buf));
size_t cur_len; if (len >= sizeof(buf))
return -E2BIG;
lxc_iterate_parts(cur, buf, "/") {
/*
* Even though we vetted the paths when we parsed the config
* we're paranoid here and check that the path is neither
* absolute nor walks upwards.
*/
if (abspath(buf))
return syserrno_set(-EINVAL, "No absolute paths allowed");
dir = tmp + strspn(tmp, "/"); if (strnequal(buf, "..", STRLITERALLEN("..")))
tmp = dir + strcspn(dir, "/"); return syserrno_set(-EINVAL, "No upward walking paths allowed");
cur_len = dir - orig; ret = mkdirat(dfd_cur, cur, mode);
makeme = strndup(orig, cur_len); if (ret < 0) {
if (!makeme) if (errno != EEXIST)
return ret_set_errno(-1, ENOMEM); return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
ret = mkdir(makeme, mode); ret = -EEXIST;
if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len))) }
return log_warn_errno(-1, errno, "Failed to create directory \"%s\"", makeme); TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);
} while (tmp != dir);
dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
if (dfd_final < 0)
return syserrno(-errno, "Fail to open%s directory %d(%s)",
!ret ? " newly created" : "", dfd_base, cur);
if (dfd_cur != dfd_base)
close(dfd_cur);
else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
/*
* Leave dfd_final pointing to the last fd we opened so
* it will be automatically zapped if we return early.
*/
dfd_cur = dfd_final;
}
	return 0; /* The final cgroup must be successfully created by us. */
if (ret) {
if (ret != -EEXIST || !eexist_ignore)
return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
}
return move_fd(dfd_final);
} }
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
...@@ -1142,34 +1097,27 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, ...@@ -1142,34 +1097,27 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
const char *cgroup_leaf, bool payload, const char *cgroup_leaf, bool payload,
const char *cgroup_limit_dir) const char *cgroup_limit_dir)
{ {
__do_close int fd_limit = -EBADF, fd_final = -EBADF;
__do_free char *path = NULL, *limit_path = NULL; __do_free char *path = NULL, *limit_path = NULL;
int ret, ret_cpuset; bool cpuset_v1 = false;
path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL); /* Don't bother with all the rest if the final cgroup already exists. */
if (dir_exists(path)) if (exists_dir_at(h->dfd_base, cgroup_leaf))
return log_warn_errno(false, errno, "The %s cgroup already existed", path); return syswarn(false, "The %d(%s) cgroup already existed", h->dfd_base, cgroup_leaf);
ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf); /*
if (ret_cpuset < 0) * The legacy cpuset controller needs massaging in case inheriting
return log_error_errno(false, errno, "Failed to handle legacy cpuset controller"); * settings from its immediate ancestor cgroup hasn't been turned on.
*/
cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
if (payload && cgroup_limit_dir) { if (payload && cgroup_limit_dir) {
/* with isolation both parts need to not already exist */ /* With isolation both parts need to not already exist. */
limit_path = must_make_path(h->mountpoint, fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
h->container_base_path, if (fd_limit < 0)
cgroup_limit_dir, NULL); return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
ret = mkdir_eexist_on_last(limit_path, 0755); limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
if (ret < 0)
return log_debug_errno(false,
errno, "Failed to create %s limiting cgroup",
limit_path);
h->cgfd_limit = lxc_open_dirfd(limit_path);
if (h->cgfd_limit < 0)
return log_error_errno(false, errno,
"Failed to open %s", path);
h->container_limit_path = move_ptr(limit_path);
/* /*
* With isolation the devices legacy cgroup needs to be * With isolation the devices legacy cgroup needs to be
...@@ -1182,30 +1130,26 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, ...@@ -1182,30 +1130,26 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
return log_error(false, "Failed to setup legacy device limits"); return log_error(false, "Failed to setup legacy device limits");
} }
ret = mkdir_eexist_on_last(path, 0755); fd_final = __cgroup_tree_create(h->dfd_base, cgroup_leaf, 0755, cpuset_v1, false);
if (ret < 0) { if (fd_final < 0)
/* return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
* This is the cpuset controller and
* cg_legacy_handle_cpuset_hierarchy() has created our target
* directory for us to ensure correct initialization.
*/
if (ret_cpuset != 1 || cgroup_tree)
return log_debug_errno(false, errno, "Failed to create %s cgroup", path);
}
path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
if (payload) { if (payload) {
h->cgfd_con = lxc_open_dirfd(path); h->cgfd_con = move_fd(fd_final);
if (h->cgfd_con < 0)
return log_error_errno(false, errno, "Failed to open %s", path);
h->container_full_path = move_ptr(path); h->container_full_path = move_ptr(path);
if (h->cgfd_limit < 0)
if (fd_limit < 0)
h->cgfd_limit = h->cgfd_con; h->cgfd_limit = h->cgfd_con;
if (!h->container_limit_path) else
h->cgfd_limit = move_fd(fd_limit);
if (!limit_path)
h->container_limit_path = h->container_full_path; h->container_limit_path = h->container_full_path;
else
h->container_limit_path = move_ptr(limit_path);
} else { } else {
h->cgfd_mon = lxc_open_dirfd(path); h->cgfd_mon = move_fd(fd_final);
if (h->cgfd_mon < 0)
return log_error_errno(false, errno, "Failed to open %s", path);
h->monitor_full_path = move_ptr(path); h->monitor_full_path = move_ptr(path);
} }
...@@ -1234,6 +1178,82 @@ static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload) ...@@ -1234,6 +1178,82 @@ static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload)
SYSWARN("Failed to rmdir(\"%s\") cgroup", limit_path); SYSWARN("Failed to rmdir(\"%s\") cgroup", limit_path);
} }
/*
 * Tear down the monitor process' cgroups in every hierarchy.
 *
 * The monitor is first moved out of its current cgroup into a pivot cgroup
 * (CGROUP_PIVOT) so that the cgroup tree it still lives in can be removed;
 * afterwards the monitor cgroup itself is deleted recursively.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy if no hierarchies were detected. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		if (!h->monitor_full_path)
			continue;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto try_lxc_rm_rf;
		}

		/*
		 * Pick the most specific pivot location the user configured,
		 * falling back to the default CGROUP_PIVOT path.
		 */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.monitor_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		/* The legacy cpuset controller needs extra initialization. */
		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor into the pivot so its old cgroup can be removed. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

try_lxc_rm_rf:
		ret = lxc_rm_rf(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}
/* /*
* Check we have no lxc.cgroup.dir, and that lxc.cgroup.dir.limit_prefix is a * Check we have no lxc.cgroup.dir, and that lxc.cgroup.dir.limit_prefix is a
* proper prefix directory of lxc.cgroup.dir.payload. * proper prefix directory of lxc.cgroup.dir.payload.
...@@ -1332,7 +1352,7 @@ __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lx ...@@ -1332,7 +1352,7 @@ __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lx
monitor_cgroup, false, NULL)) monitor_cgroup, false, NULL))
continue; continue;
DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)"); DEBUG("Failed to create cgroup \"%s\"", maybe_empty(ops->hierarchies[i]->monitor_full_path));
for (int j = 0; j < i; j++) for (int j = 0; j < i; j++)
cgroup_tree_leaf_remove(ops->hierarchies[j], false); cgroup_tree_leaf_remove(ops->hierarchies[j], false);
...@@ -3251,32 +3271,6 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops * ...@@ -3251,32 +3271,6 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *
return __cgfsng_delegate_controllers(ops, ops->container_cgroup); return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
} }
/*
 * Check whether every controller in @controllers appears in the
 * configured lxc.cgroup.use list (ops->cgroup_use). An unset list means
 * "use everything" and matches unconditionally.
 */
static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
					 char **controllers)
{
	char **ctrl;

	/* No restriction configured: every controller is wanted. */
	if (!ops->cgroup_use)
		return true;

	for (ctrl = controllers; ctrl && *ctrl; ctrl++) {
		char **use;
		bool wanted = false;

		for (use = ops->cgroup_use; use && *use; use++) {
			if (strequal(*use, *ctrl)) {
				wanted = true;
				break;
			}
		}

		/* A single unrequested controller disqualifies the hierarchy. */
		if (!wanted)
			return false;
	}

	return true;
}
static void cg_unified_delegate(char ***delegate) static void cg_unified_delegate(char ***delegate)
{ {
__do_free char *buf = NULL; __do_free char *buf = NULL;
...@@ -3343,7 +3337,6 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg ...@@ -3343,7 +3337,6 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg
__do_free_string_list char **controller_list = NULL; __do_free_string_list char **controller_list = NULL;
int type; int type;
bool writeable; bool writeable;
struct hierarchy *new;
type = get_cgroup_version(line); type = get_cgroup_version(line);
if (type == 0) if (type == 0)
...@@ -3382,16 +3375,14 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg ...@@ -3382,16 +3375,14 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg
} }
if (type == CGROUP_SUPER_MAGIC) if (type == CGROUP_SUPER_MAGIC)
base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC); base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
else else
base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC); base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
if (!base_cgroup) { if (!base_cgroup) {
WARN("Failed to find current cgroup"); WARN("Failed to find current cgroup");
continue; continue;
} }
trim(base_cgroup);
prune_init_scope(base_cgroup);
if (type == CGROUP2_SUPER_MAGIC) if (type == CGROUP2_SUPER_MAGIC)
writeable = test_writeable_v2(mountpoint, base_cgroup); writeable = test_writeable_v2(mountpoint, base_cgroup);
else else
...@@ -3401,41 +3392,16 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg ...@@ -3401,41 +3392,16 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg
continue; continue;
} }
if (type == CGROUP2_SUPER_MAGIC) { if (type == CGROUP2_SUPER_MAGIC)
char *cgv2_ctrl_path; ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
else
cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup, ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
"cgroup.controllers", if (ret)
NULL); return syserrno(ret, "Failed to add cgroup hierarchy");
if (ops->unified && unprivileged)
controller_list = cg_unified_get_controllers(-EBADF, cgv2_ctrl_path); cg_unified_delegate(&(ops->unified)->cgroup2_chown);
free(cgv2_ctrl_path);
if (!controller_list) {
controller_list = cg_unified_make_empty_controller();
TRACE("No controllers are enabled for "
"delegation in the unified hierarchy");
}
}
/* Exclude all controllers that cgroup use does not want. */
if (!cgroup_use_wants_controllers(ops, controller_list)) {
TRACE("Skipping controller");
continue;
}
new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
if (!new)
return log_error_errno(-1, errno, "Failed to add cgroup hierarchy");
if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
if (unprivileged)
cg_unified_delegate(&new->cgroup2_chown);
ops->unified = new;
}
} }
TRACE("Writable cgroup hierarchies:");
lxc_cgfsng_print_hierarchies(ops);
/* verify that all controllers in cgroup.use and all crucial /* verify that all controllers in cgroup.use and all crucial
* controllers are accounted for * controllers are accounted for
*/ */
...@@ -3448,8 +3414,7 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg ...@@ -3448,8 +3414,7 @@ static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileg
/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */ /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
static char *cg_unified_get_current_cgroup(bool relative) static char *cg_unified_get_current_cgroup(bool relative)
{ {
__do_free char *basecginfo = NULL; __do_free char *basecginfo = NULL, *copy = NULL;
char *copy;
char *base_cgroup; char *base_cgroup;
if (!relative && (geteuid() == 0)) if (!relative && (geteuid() == 0))
...@@ -3467,48 +3432,32 @@ static char *cg_unified_get_current_cgroup(bool relative) ...@@ -3467,48 +3432,32 @@ static char *cg_unified_get_current_cgroup(bool relative)
copy = copy_to_eol(base_cgroup); copy = copy_to_eol(base_cgroup);
if (!copy) if (!copy)
return NULL; return NULL;
trim(copy);
if (!relative) {
base_cgroup = prune_init_scope(copy);
if (!base_cgroup)
return NULL;
} else {
base_cgroup = copy;
}
if (abspath(base_cgroup))
base_cgroup = deabs(base_cgroup);
return trim(copy); /* We're allowing base_cgroup to be "". */
return strdup(base_cgroup);
} }
static int cg_unified_init(struct cgroup_ops *ops, bool relative, static int cg_unified_init(struct cgroup_ops *ops, bool relative,
bool unprivileged) bool unprivileged)
{ {
__do_close int cgroup_root_fd = -EBADF; __do_free char *base_cgroup = NULL;
__do_free char *base_cgroup = NULL, *controllers_path = NULL;
__do_free_string_list char **delegatable = NULL;
__do_free struct hierarchy *new = NULL;
int ret; int ret;
ret = unified_cgroup_hierarchy();
if (ret == -ENOMEDIUM)
return ret_errno(ENOMEDIUM);
if (ret != CGROUP2_SUPER_MAGIC)
return 0;
base_cgroup = cg_unified_get_current_cgroup(relative); base_cgroup = cg_unified_get_current_cgroup(relative);
if (!base_cgroup) if (!base_cgroup)
return ret_errno(EINVAL); return ret_errno(EINVAL);
if (!relative)
prune_init_scope(base_cgroup);
cgroup_root_fd = openat(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
O_NOCTTY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY);
if (cgroup_root_fd < 0)
return -errno;
/*
* We assume that the cgroup we're currently in has been delegated to
* us and we are free to further delege all of the controllers listed
* in cgroup.controllers further down the hierarchy.
*/
controllers_path = must_make_path_relative(base_cgroup, "cgroup.controllers", NULL);
delegatable = cg_unified_get_controllers(cgroup_root_fd, controllers_path);
if (!delegatable)
delegatable = cg_unified_make_empty_controller();
if (!delegatable[0])
TRACE("No controllers are enabled for delegation");
/* TODO: If the user requested specific controllers via lxc.cgroup.use /* TODO: If the user requested specific controllers via lxc.cgroup.use
* we should verify here. The reason I'm not doing it right is that I'm * we should verify here. The reason I'm not doing it right is that I'm
...@@ -3517,31 +3466,41 @@ static int cg_unified_init(struct cgroup_ops *ops, bool relative, ...@@ -3517,31 +3466,41 @@ static int cg_unified_init(struct cgroup_ops *ops, bool relative,
* controllers per container. * controllers per container.
*/ */
new = add_hierarchy(&ops->hierarchies, ret = add_hierarchy(ops, NULL,
move_ptr(delegatable),
must_copy_string(DEFAULT_CGROUP_MOUNTPOINT), must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
move_ptr(base_cgroup), move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
CGROUP2_SUPER_MAGIC); if (ret)
if (!new) return syserrno(ret, "Failed to add unified cgroup hierarchy");
return log_error_errno(-1, errno, "Failed to add unified cgroup hierarchy");
if (unprivileged) if (unprivileged)
cg_unified_delegate(&new->cgroup2_chown); cg_unified_delegate(&(ops->unified)->cgroup2_chown);
if (bpf_devices_cgroup_supported()) if (bpf_devices_cgroup_supported())
new->bpf_device_controller = 1; ops->unified->bpf_device_controller = 1;
ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
ops->unified = move_ptr(new);
return CGROUP2_SUPER_MAGIC; return CGROUP2_SUPER_MAGIC;
} }
static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf) static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
{ {
__do_close int dfd = -EBADF;
bool relative = conf->cgroup_meta.relative;
int ret; int ret;
const char *tmp; const char *tmp;
bool relative = conf->cgroup_meta.relative;
if (ops->dfd_mnt_cgroupfs_host >= 0)
return ret_errno(EINVAL);
/*
* I don't see the need for allowing symlinks here. If users want to
* have their hierarchy available in different locations I strongly
* suggest bind-mounts.
*/
dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
if (dfd < 0)
return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
tmp = lxc_global_config_value("lxc.cgroup.use"); tmp = lxc_global_config_value("lxc.cgroup.use");
if (tmp) { if (tmp) {
...@@ -3555,14 +3514,23 @@ static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf) ...@@ -3555,14 +3514,23 @@ static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
must_append_string(&ops->cgroup_use, cur); must_append_string(&ops->cgroup_use, cur);
} }
ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map)); /*
if (ret < 0) * Keep dfd referenced by the cleanup function and actually move the fd
return -1; * once we know the initialization succeeded. So if we fail we clean up
* the dfd.
*/
ops->dfd_mnt_cgroupfs_host = dfd;
if (ret == CGROUP2_SUPER_MAGIC) if (unified_cgroup_fd(dfd))
return 0; ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
else
ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
if (ret < 0)
return syserrno(ret, "Failed to initialize cgroups");
return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map)); /* Transfer ownership to cgroup_ops. */
move_fd(dfd);
return 0;
} }
__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
...@@ -3589,8 +3557,9 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf) ...@@ -3589,8 +3557,9 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
return ret_set_errno(NULL, ENOMEM); return ret_set_errno(NULL, ENOMEM);
cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
if (cg_init(cgfsng_ops, conf)) if (__cgroup_init(cgfsng_ops, conf))
return NULL; return NULL;
cgfsng_ops->data_init = cgfsng_data_init; cgfsng_ops->data_init = cgfsng_data_init;
......
...@@ -33,10 +33,14 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf) ...@@ -33,10 +33,14 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf)
if (!cgroup_ops) if (!cgroup_ops)
return log_error_errno(NULL, errno, "Failed to initialize cgroup driver"); return log_error_errno(NULL, errno, "Failed to initialize cgroup driver");
if (!cgroup_ops->hierarchies) {
cgroup_exit(cgroup_ops);
return log_error_errno(NULL, ENOENT, "No cgroup hierarchies found");
}
if (cgroup_ops->data_init(cgroup_ops)) { if (cgroup_ops->data_init(cgroup_ops)) {
cgroup_exit(cgroup_ops); cgroup_exit(cgroup_ops);
return log_error_errno(NULL, errno, return log_error_errno(NULL, errno, "Failed to initialize cgroup data");
"Failed to initialize cgroup data");
} }
TRACE("Initialized cgroup driver %s", cgroup_ops->driver); TRACE("Initialized cgroup driver %s", cgroup_ops->driver);
...@@ -68,6 +72,9 @@ void cgroup_exit(struct cgroup_ops *ops) ...@@ -68,6 +72,9 @@ void cgroup_exit(struct cgroup_ops *ops)
if (ops->cgroup2_devices) if (ops->cgroup2_devices)
bpf_program_free(ops->cgroup2_devices); bpf_program_free(ops->cgroup2_devices);
if (ops->dfd_mnt_cgroupfs_host >= 0)
close(ops->dfd_mnt_cgroupfs_host);
for (struct hierarchy **it = ops->hierarchies; it && *it; it++) { for (struct hierarchy **it = ops->hierarchies; it && *it; it++) {
for (char **p = (*it)->controllers; p && *p; p++) for (char **p = (*it)->controllers; p && *p; p++)
free(*p); free(*p);
...@@ -79,12 +86,34 @@ void cgroup_exit(struct cgroup_ops *ops) ...@@ -79,12 +86,34 @@ void cgroup_exit(struct cgroup_ops *ops)
free((*it)->mountpoint); free((*it)->mountpoint);
free((*it)->container_base_path); free((*it)->container_base_path);
free((*it)->container_full_path);
free((*it)->monitor_full_path); {
if ((*it)->cgfd_con >= 0) free((*it)->container_full_path);
close((*it)->cgfd_con);
if ((*it)->container_full_path != (*it)->container_limit_path)
free((*it)->monitor_full_path);
}
{
if ((*it)->cgfd_limit >= 0 && (*it)->cgfd_con != (*it)->cgfd_limit)
close((*it)->cgfd_limit);
if ((*it)->cgfd_con >= 0)
close((*it)->cgfd_con);
}
if ((*it)->cgfd_mon >= 0) if ((*it)->cgfd_mon >= 0)
close((*it)->cgfd_mon); close((*it)->cgfd_mon);
{
if ((*it)->dfd_base >= 0 && (*it)->dfd_mnt != (*it)->dfd_base)
close((*it)->dfd_base);
if ((*it)->dfd_mnt >= 0)
close((*it)->dfd_mnt);
}
free(*it); free(*it);
} }
free(ops->hierarchies); free(ops->hierarchies);
...@@ -95,21 +124,13 @@ void cgroup_exit(struct cgroup_ops *ops) ...@@ -95,21 +124,13 @@ void cgroup_exit(struct cgroup_ops *ops)
} }
#define INIT_SCOPE "/init.scope" #define INIT_SCOPE "/init.scope"
void prune_init_scope(char *cg) char *prune_init_scope(char *cg)
{ {
char *point; if (is_empty_string(cg))
return NULL;
if (!cg) if (strnequal(cg, INIT_SCOPE, STRLITERALLEN(INIT_SCOPE)))
return; return cg + STRLITERALLEN(INIT_SCOPE);
point = cg + strlen(cg) - strlen(INIT_SCOPE);
if (point < cg)
return;
if (strequal(point, INIT_SCOPE)) { return cg;
if (point == cg)
*(point + 1) = '\0';
else
*point = '\0';
}
} }
...@@ -91,12 +91,24 @@ struct hierarchy { ...@@ -91,12 +91,24 @@ struct hierarchy {
unsigned int bpf_device_controller:1; unsigned int bpf_device_controller:1;
unsigned int freezer_controller:1; unsigned int freezer_controller:1;
/* container cgroup fd */ /* File descriptor for the container's cgroup @container_full_path. */
int cgfd_con; int cgfd_con;
/* limiting cgroup fd (may be equal to cgfd_con if not separated) */
/*
* File descriptor for the container's limiting cgroup
* @container_limit_path.
* Will be equal to @cgfd_con if no limiting cgroup has been requested.
*/
int cgfd_limit; int cgfd_limit;
/* monitor cgroup fd */
/* File descriptor for the monitor's cgroup @monitor_full_path. */
int cgfd_mon; int cgfd_mon;
/* File descriptor for the controller's mountpoint @mountpoint. */
int dfd_mnt;
/* File descriptor for the controller's base cgroup path @container_base_path. */
int dfd_base;
}; };
struct cgroup_ops { struct cgroup_ops {
...@@ -106,6 +118,18 @@ struct cgroup_ops { ...@@ -106,6 +118,18 @@ struct cgroup_ops {
/* string constant */ /* string constant */
const char *version; const char *version;
/*
* File descriptor for the host's cgroupfs mount. On
* CGROUP_LAYOUT_LEGACY or CGROUP_LAYOUT_HYBRID hybrid systems
* @dfd_mnt_cgroupfs_host will be a tmpfs fd and the individual
* controllers will be cgroupfs fds. On CGROUP_LAYOUT_UNIFIED it will
* be a cgroupfs fd itself.
*
* So for CGROUP_LAYOUT_LEGACY or CGROUP_LAYOUT_HYBRID we allow
* mountpoint crossing iff we cross from a tmpfs into a cgroupfs mount.
* */
int dfd_mnt_cgroupfs_host;
/* What controllers is the container supposed to use. */ /* What controllers is the container supposed to use. */
char **cgroup_use; char **cgroup_use;
char *cgroup_pattern; char *cgroup_pattern;
...@@ -186,7 +210,7 @@ __hidden extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf); ...@@ -186,7 +210,7 @@ __hidden extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf);
__hidden extern void cgroup_exit(struct cgroup_ops *ops); __hidden extern void cgroup_exit(struct cgroup_ops *ops);
define_cleanup_function(struct cgroup_ops *, cgroup_exit); define_cleanup_function(struct cgroup_ops *, cgroup_exit);
__hidden extern void prune_init_scope(char *cg); __hidden extern char *prune_init_scope(char *cg);
__hidden extern int cgroup_attach(const struct lxc_conf *conf, const char *name, __hidden extern int cgroup_attach(const struct lxc_conf *conf, const char *name,
const char *lxcpath, pid_t pid); const char *lxcpath, pid_t pid);
......
...@@ -83,22 +83,6 @@ bool test_writeable_v2(char *mountpoint, char *path) ...@@ -83,22 +83,6 @@ bool test_writeable_v2(char *mountpoint, char *path)
return (access(cgroup_threads_file, W_OK) == 0); return (access(cgroup_threads_file, W_OK) == 0);
} }
int unified_cgroup_hierarchy(void)
{
int ret;
struct statfs fs;
ret = statfs(DEFAULT_CGROUP_MOUNTPOINT, &fs);
if (ret < 0)
return -ENOMEDIUM;
if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
return CGROUP2_SUPER_MAGIC;
return 0;
}
int unified_cgroup_fd(int fd) int unified_cgroup_fd(int fd)
{ {
......
...@@ -29,8 +29,6 @@ __hidden extern bool test_writeable_v1(char *mountpoint, char *path); ...@@ -29,8 +29,6 @@ __hidden extern bool test_writeable_v1(char *mountpoint, char *path);
*/ */
__hidden extern bool test_writeable_v2(char *mountpoint, char *path); __hidden extern bool test_writeable_v2(char *mountpoint, char *path);
__hidden extern int unified_cgroup_hierarchy(void);
__hidden extern int unified_cgroup_fd(int fd); __hidden extern int unified_cgroup_fd(int fd);
static inline bool cgns_supported(void) static inline bool cgns_supported(void)
......
...@@ -31,15 +31,15 @@ int lxc_readat(int dirfd, const char *filename, void *buf, size_t count) ...@@ -31,15 +31,15 @@ int lxc_readat(int dirfd, const char *filename, void *buf, size_t count)
__do_close int fd = -EBADF; __do_close int fd = -EBADF;
ssize_t ret; ssize_t ret;
fd = openat(dirfd, filename, O_RDONLY | O_CLOEXEC); fd = open_at(dirfd, filename, PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
if (fd < 0) if (fd < 0)
return -1; return -errno;
ret = lxc_read_nointr(fd, buf, count); ret = lxc_read_nointr(fd, buf, count);
if (ret < 0 || (size_t)ret != count) if (ret < 0)
return -1; return -errno;
return 0; return ret;
} }
int lxc_writeat(int dirfd, const char *filename, const void *buf, size_t count) int lxc_writeat(int dirfd, const char *filename, const void *buf, size_t count)
...@@ -630,21 +630,31 @@ int timens_offset_write(clockid_t clk_id, int64_t s_offset, int64_t ns_offset) ...@@ -630,21 +630,31 @@ int timens_offset_write(clockid_t clk_id, int64_t s_offset, int64_t ns_offset)
bool exists_dir_at(int dir_fd, const char *path) bool exists_dir_at(int dir_fd, const char *path)
{ {
struct stat sb;
int ret; int ret;
struct stat sb;
ret = fstatat(dir_fd, path, &sb, 0); ret = fstatat(dir_fd, path, &sb, 0);
if (ret < 0) if (ret < 0)
return false; return false;
return S_ISDIR(sb.st_mode); ret = S_ISDIR(sb.st_mode);
if (ret)
errno = EEXIST;
else
errno = ENOTDIR;
return ret;
} }
bool exists_file_at(int dir_fd, const char *path) bool exists_file_at(int dir_fd, const char *path)
{ {
int ret;
struct stat sb; struct stat sb;
return fstatat(dir_fd, path, &sb, 0) == 0; ret = fstatat(dir_fd, path, &sb, 0);
if (ret == 0)
errno = EEXIST;
return ret == 0;
} }
int open_at(int dfd, const char *path, unsigned int o_flags, int open_at(int dfd, const char *path, unsigned int o_flags,
......
...@@ -501,6 +501,20 @@ __lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \ ...@@ -501,6 +501,20 @@ __lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \
__internal_ret__; \ __internal_ret__; \
}) })
#define syswarn(__ret__, format, ...) \
({ \
typeof(__ret__) __internal_ret__ = (__ret__); \
SYSWARN(format, ##__VA_ARGS__); \
__internal_ret__; \
})
#define sysdebug(__ret__, format, ...) \
({ \
typeof(__ret__) __internal_ret__ = (__ret__); \
SYSDEBUG(format, ##__VA_ARGS__); \
__internal_ret__; \
})
#define syserrno_set(__ret__, format, ...) \ #define syserrno_set(__ret__, format, ...) \
({ \ ({ \
typeof(__ret__) __internal_ret__ = (__ret__); \ typeof(__ret__) __internal_ret__ = (__ret__); \
......
...@@ -813,6 +813,8 @@ char *must_make_path(const char *first, ...) ...@@ -813,6 +813,8 @@ char *must_make_path(const char *first, ...)
va_start(args, first); va_start(args, first);
while ((cur = va_arg(args, char *)) != NULL) { while ((cur = va_arg(args, char *)) != NULL) {
buf_len = strlen(cur); buf_len = strlen(cur);
if (buf_len == 0)
continue;
full_len += buf_len; full_len += buf_len;
if (cur[0] != '/') if (cur[0] != '/')
......
...@@ -150,6 +150,11 @@ static inline bool abspath(const char *str) ...@@ -150,6 +150,11 @@ static inline bool abspath(const char *str)
return *str == '/'; return *str == '/';
} }
static inline char *deabs(char *str)
{
return str + strspn(str, "/");
}
#define strnprintf(buf, buf_size, ...) \ #define strnprintf(buf, buf_size, ...) \
({ \ ({ \
int __ret_strnprintf; \ int __ret_strnprintf; \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment