Unverified Commit 7e925736 by Stéphane Graber Committed by GitHub

Merge pull request #3673 from brauner/2021-02-15/nesting

cgroups: first batch of cgroup mounting fixes
parents 4b946797 a3e5ec26
......@@ -27,7 +27,8 @@
#include "af_unix.h"
#include "attach.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "commands.h"
#include "conf.h"
#include "config.h"
......
......@@ -1709,16 +1709,16 @@ __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
}
/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int type)
static inline bool cg_mount_needs_subdirs(int cg_flags)
{
return !(type >= LXC_AUTO_CGROUP_FULL_RO);
return !(cg_flags >= LXC_AUTO_CGROUP_FULL_RO);
}
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
* remount controller ro if needed and bindmount the cgroupfs onto
* control/the/cg/path.
*/
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
static int cg_legacy_mount_controllers(int cg_flags, struct hierarchy *h,
char *controllerpath, char *cgpath,
const char *container_cgroup)
{
......@@ -1726,7 +1726,7 @@ static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
int ret, remount_flags;
int flags = MS_BIND;
if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
if ((cg_flags & LXC_AUTO_CGROUP_RO) || (cg_flags & LXC_AUTO_CGROUP_MIXED)) {
ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
if (ret < 0)
return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
......@@ -1746,7 +1746,7 @@ static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
sourcepath = must_make_path(h->mountpoint, h->container_base_path,
container_cgroup, NULL);
if (type == LXC_AUTO_CGROUP_RO)
if ((cg_flags & LXC_AUTO_CGROUP_RO))
flags |= MS_RDONLY;
ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
......@@ -1768,15 +1768,15 @@ static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
return 0;
}
/* __cg_mount_direct
/* __cgroupfs_mount
*
* Mount cgroup hierarchies directly without using bind-mounts. The main
* use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
* cgroups for the LXC_AUTO_CGROUP_FULL option.
*/
static int __cg_mount_direct(int type, struct hierarchy *h,
struct lxc_rootfs *rootfs,
int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
static int __cgroupfs_mount(int cg_flags, struct hierarchy *h,
struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
const char *hierarchy_mnt)
{
__do_close int fd_fs = -EBADF;
unsigned int flags = 0;
......@@ -1791,7 +1791,8 @@ static int __cg_mount_direct(int type, struct hierarchy *h,
flags |= MOUNT_ATTR_NODEV;
flags |= MOUNT_ATTR_RELATIME;
if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
if ((cg_flags & LXC_AUTO_CGROUP_RO) ||
(cg_flags & LXC_AUTO_CGROUP_FULL_RO))
flags |= MOUNT_ATTR_RDONLY;
if (is_unified_hierarchy(h)) {
......@@ -1847,31 +1848,32 @@ static int __cg_mount_direct(int type, struct hierarchy *h,
return 0;
}
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
struct lxc_rootfs *rootfs,
int dfd_mnt_cgroupfs,
const char *hierarchy_mnt)
static inline int cgroupfs_mount(int cg_flags, struct hierarchy *h,
struct lxc_rootfs *rootfs,
int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
return __cg_mount_direct(type, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
return __cgroupfs_mount(cg_flags, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
}
static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
struct lxc_rootfs *rootfs,
int dfd_mnt_cgroupfs,
const char *hierarchy_mnt)
static inline int cgroupfs_bind_mount(int cg_flags, struct hierarchy *h,
struct lxc_rootfs *rootfs,
int dfd_mnt_cgroupfs,
const char *hierarchy_mnt)
{
if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
if (!(cg_flags & LXC_AUTO_CGROUP_FULL_RO) &&
!(cg_flags & LXC_AUTO_CGROUP_FULL_MIXED))
return 0;
return __cg_mount_direct(type, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
return __cgroupfs_mount(cg_flags, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
}
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
struct lxc_conf *conf, int type)
struct lxc_handler *handler, int cg_flags)
{
__do_close int dfd_mnt_cgroupfs = -EBADF, fd_fs = -EBADF;
__do_free char *cgroup_root = NULL;
bool has_cgns = false, wants_force_mount = false;
bool in_cgroup_ns = false, wants_force_mount = false;
struct lxc_conf *conf = handler->conf;
struct lxc_rootfs *rootfs = &conf->rootfs;
const char *rootfs_mnt = get_rootfs_mnt(rootfs);
int ret;
......@@ -1885,13 +1887,11 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
if (!conf)
return ret_set_errno(false, EINVAL);
if ((type & LXC_AUTO_CGROUP_MASK) == 0)
return true;
if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
return log_trace(true, "No cgroup mounts requested");
if (type & LXC_AUTO_CGROUP_FORCE) {
type &= ~LXC_AUTO_CGROUP_FORCE;
if (cg_flags & LXC_AUTO_CGROUP_FORCE)
wants_force_mount = true;
}
if (!wants_force_mount) {
wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
......@@ -1910,14 +1910,30 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
wants_force_mount = true;
}
has_cgns = cgns_supported();
if (has_cgns && !wants_force_mount)
return true;
if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP)) {
in_cgroup_ns = true;
/*
* When cgroup namespaces are supported and used by the
* container the LXC_AUTO_CGROUP_MIXED and
* LXC_AUTO_CGROUP_FULL_MIXED auto mount options don't apply
* since the parent directory of the container's cgroup is not
* accessible to the container.
*/
cg_flags &= ~LXC_AUTO_CGROUP_MIXED;
cg_flags &= ~LXC_AUTO_CGROUP_FULL_MIXED;
}
if (type == LXC_AUTO_CGROUP_NOSPEC)
type = LXC_AUTO_CGROUP_MIXED;
else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
type = LXC_AUTO_CGROUP_FULL_MIXED;
if (in_cgroup_ns && !wants_force_mount)
return log_trace(true, "Mounting cgroups not requested or needed");
/*
* Fallback to a mixed layout when the user did not specify what cgroup
* layout they want.
*/
if ((cg_flags & LXC_AUTO_CGROUP_NOSPEC))
cg_flags = LXC_AUTO_CGROUP_MIXED;
else if (cg_flags & LXC_AUTO_CGROUP_FULL_NOSPEC)
cg_flags = LXC_AUTO_CGROUP_FULL_MIXED;
/* This is really the codepath that we want. */
if (pure_unified_layout(ops)) {
......@@ -1929,16 +1945,70 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
return log_error_errno(-errno, errno, "Failed to open %d(%s)",
rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
if (has_cgns && wants_force_mount) {
/*
* If cgroup namespaces are supported but the container will
* not have CAP_SYS_ADMIN after it has started we need to mount
* the cgroups manually.
*
* Note that here we know that wants_force_mount is true.
* Otherwise we would've returned early above.
*/
if (in_cgroup_ns) {
/*
* If cgroup namespaces are supported but the container
* will not have CAP_SYS_ADMIN after it has started we
* need to mount the cgroups manually.
* 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
* 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
* 3. cgroup:mixed:force -> See comment above how this
* does not apply so
* cgroup:mixed is equal to
* cgroup:rw when cgroup
* namespaces are supported.
* 4. cgroup:rw -> No-op; init system responsible for mounting.
* 5. cgroup:ro -> No-op; init system responsible for mounting.
* 6. cgroup:mixed -> No-op; init system responsible for mounting.
*
* 7. cgroup-full:rw -> Not supported.
* 8. cgroup-full:ro -> Not supported.
* 9. cgroup-full:mixed -> Not supported.
* 10. cgroup-full:rw:force -> Not supported.
* 11. cgroup-full:ro:force -> Not supported.
* 12. cgroup-full:mixed:force -> Not supported.
*/
ret = cgroupfs_mount(cg_flags, ops->unified, rootfs, dfd_mnt_cgroupfs, "");
if (ret < 0)
return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
} else {
/*
* Either no cgroup namespace is supported (highly
* unlikely unless we're dealing with a Frankenkernel),
* or the user requested to keep the cgroup namespace
* of the host or another container.
*/
return cg_mount_in_cgroup_namespace(type, ops->unified, rootfs, dfd_mnt_cgroupfs, "") == 0;
if (wants_force_mount) {
/*
* 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
* 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
* 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem and
* make the parent directory of the
* container's cgroup read-only but the
* container's cgroup writable.
*
* 10. cgroup-full:rw:force ->
* 11. cgroup-full:ro:force ->
* 12. cgroup-full:mixed:force ->
*/
errno = EOPNOTSUPP;
SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
} else {
errno = EOPNOTSUPP;
SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
}
}
return cg_mount_cgroup_full(type, ops->unified, rootfs, dfd_mnt_cgroupfs, "") == 0;
return syserrno(false, "Failed to mount cgroups");
}
/*
......@@ -1994,13 +2064,13 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
if (ret < 0)
return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_cgroupfs, controller);
if (has_cgns && wants_force_mount) {
if (in_cgroup_ns && wants_force_mount) {
/*
* If cgroup namespaces are supported but the container
* will not have CAP_SYS_ADMIN after it has started we
* need to mount the cgroups manually.
*/
ret = cg_mount_in_cgroup_namespace(type, h, rootfs, dfd_mnt_cgroupfs, controller);
ret = cgroupfs_mount(cg_flags, h, rootfs, dfd_mnt_cgroupfs, controller);
if (ret < 0)
return false;
......@@ -2008,11 +2078,11 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
}
/* Here is where the ancient kernel section begins. */
ret = cg_mount_cgroup_full(type, h, rootfs, dfd_mnt_cgroupfs, controller);
ret = cgroupfs_bind_mount(cg_flags, h, rootfs, dfd_mnt_cgroupfs, controller);
if (ret < 0)
return false;
if (!cg_mount_needs_subdirs(type))
if (!cg_mount_needs_subdirs(cg_flags))
continue;
controllerpath = must_make_path(cgroup_root, controller, NULL);
......@@ -2024,7 +2094,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
if (ret < 0)
return false;
ret = cg_legacy_mount_controllers(type, h, controllerpath, path2, ops->container_cgroup);
ret = cg_legacy_mount_controllers(cg_flags, h, controllerpath, path2, ops->container_cgroup);
if (ret < 0)
return false;
}
......
......@@ -172,7 +172,7 @@ struct cgroup_ops {
bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
bool (*attach)(struct cgroup_ops *ops, const struct lxc_conf *conf,
const char *name, const char *lxcpath, pid_t pid);
bool (*mount)(struct cgroup_ops *ops, struct lxc_conf *conf, int type);
bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler, int type);
bool (*devices_activate)(struct cgroup_ops *ops,
struct lxc_handler *handler);
bool (*monitor_delegate_controllers)(struct cgroup_ops *ops);
......
......@@ -7,6 +7,7 @@
#include <stdio.h>
#include "compiler.h"
#include "file_utils.h"
/* Retrieve the cgroup version of a given entry from /proc/<pid>/mountinfo. */
__hidden extern int get_cgroup_version(char *line);
......@@ -32,4 +33,14 @@ __hidden extern int unified_cgroup_hierarchy(void);
__hidden extern int unified_cgroup_fd(int fd);
/*
 * Report whether "/proc/self/ns/cgroup" passes the file_exists() check.
 *
 * The answer is kept in a function-local static, so calls made after a result
 * has been recorded reuse it instead of probing again.
 */
static inline bool cgns_supported(void)
{
	static int cached = -1;

	/* Fast path: a previous call already recorded the probe result. */
	if (cached != -1)
		return cached == 1;

	cached = file_exists("/proc/self/ns/cgroup");
	return cached == 1;
}
#endif /* __LXC_CGROUP_UTILS_H */
......@@ -564,7 +564,7 @@ static int add_shmount_to_list(struct lxc_conf *conf)
return add_elem_to_mount_list(new_mount, conf);
}
static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
{
int i, ret;
static struct {
......@@ -608,6 +608,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
{ 0, 0, NULL, NULL, NULL, 0, NULL, false }
};
struct lxc_conf *conf = handler->conf;
struct lxc_rootfs *rootfs = &conf->rootfs;
bool has_cap_net_admin;
......@@ -703,7 +704,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
if (flags & LXC_AUTO_CGROUP_FORCE)
cg_flags |= LXC_AUTO_CGROUP_FORCE;
if (!handler->cgroup_ops->mount(handler->cgroup_ops, conf, cg_flags))
if (!handler->cgroup_ops->mount(handler->cgroup_ops, handler, cg_flags))
return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
}
......@@ -3432,7 +3433,7 @@ int lxc_setup(struct lxc_handler *handler)
/* Do automatic mounts (mainly /proc and /sys), but exclude those that
* need to wait until other stuff has finished.
*/
ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
ret = lxc_mount_auto_mounts(handler, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK);
if (ret < 0)
return log_error(-1, "Failed to setup first automatic mounts");
......@@ -3473,7 +3474,7 @@ int lxc_setup(struct lxc_handler *handler)
* mounted. It is guaranteed to be mounted now either through
* automatically or via fstab entries.
*/
ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
ret = lxc_mount_auto_mounts(handler, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK);
if (ret < 0)
return log_error(-1, "Failed to setup remaining automatic mounts");
......
......@@ -241,7 +241,7 @@ enum {
LXC_AUTO_CGROUP_NOSPEC = 0x0B0, /* /sys/fs/cgroup (partial mount, r/w or mixed, depending on caps) */
LXC_AUTO_CGROUP_FULL_NOSPEC = 0x0E0, /* /sys/fs/cgroup (full mount, r/w or mixed, depending on caps) */
LXC_AUTO_CGROUP_FORCE = 0x100, /* mount cgroups even when cgroup namespaces are supported */
LXC_AUTO_CGROUP_MASK = 0x1F0, /* all known cgroup options, doe not contain LXC_AUTO_CGROUP_FORCE */
LXC_AUTO_CGROUP_MASK = 0x1F0, /* all known cgroup options */
LXC_AUTO_SHMOUNTS = 0x200, /* shared mount point */
LXC_AUTO_SHMOUNTS_MASK = 0x200, /* shared mount point mask */
......
......@@ -494,6 +494,13 @@ __lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \
__internal_ret__; \
})
/*
 * syserrno - log an error via SYSERROR() and yield @__ret__
 *
 * @__ret__ is evaluated exactly once, before SYSERROR() is invoked, and its
 * value becomes the value of the whole statement expression. This lets callers
 * write "return syserrno(false, "...");" to log and return in one step.
 * @format and the variadic arguments are forwarded to SYSERROR() unchanged.
 */
#define syserrno(__ret__, format, ...) \
({ \
typeof(__ret__) __internal_ret__ = (__ret__); \
SYSERROR(format, ##__VA_ARGS__); \
__internal_ret__; \
})
#define log_error(__ret__, format, ...) \
({ \
typeof(__ret__) __internal_ret__ = (__ret__); \
......
......@@ -13,6 +13,7 @@
#include <unistd.h>
#include "caps.h"
#include "cgroups/cgroup_utils.h"
#include "conf.h"
#include "config.h"
#include "initutils.h"
......
......@@ -27,7 +27,8 @@
#include "af_unix.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "compiler.h"
......
......@@ -180,4 +180,10 @@ __hidden extern int __lxc_start(struct lxc_handler *, struct lxc_operations *, v
__hidden extern int resolve_clone_flags(struct lxc_handler *handler);
__hidden extern void lxc_expose_namespace_environment(const struct lxc_handler *handler);
/*
 * Tell whether any bit of @ns_flag is set in @handler->ns_clone_flags.
 *
 * @handler must not be NULL; it is dereferenced unconditionally.
 */
static inline bool container_uses_namespace(const struct lxc_handler *handler,
					    unsigned int ns_flag)
{
	return (handler->ns_clone_flags & ns_flag) != 0;
}
#endif
......@@ -747,11 +747,6 @@ char *on_path(const char *cmd, const char *rootfs)
return NULL;
}
/*
 * Return the result of the file_exists() check on "/proc/self/ns/cgroup".
 * NOTE(review): presumably a true result means the kernel offers cgroup
 * namespaces; the check is repeated on every call.
 */
bool cgns_supported(void)
{
return file_exists("/proc/self/ns/cgroup");
}
/* historically lxc-init has been under /usr/lib/lxc and under
* /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
*/
......@@ -1860,3 +1855,53 @@ bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res)
*res = base * mult;
return true;
}
/*
 * print_r - recursively log mode and ownership of a directory tree
 *
 * @fd:   directory file descriptor that @path is resolved against
 * @path: directory relative to @fd; when empty, @fd itself is listed
 *
 * Debugging helper. Every entry below the directory is logged through INFO();
 * subdirectories are descended into recursively and the directory itself is
 * logged last.
 *
 * Returns 0 if the directory could be opened and its own metadata could be
 * read, -1 otherwise. A failure while walking the entries stops the walk early
 * but, as before, is not reflected in the return value.
 */
int print_r(int fd, const char *path)
{
	__do_close int dfd = -EBADF;
	__do_closedir DIR *dir = NULL;
	/* Never hand a NULL pointer to a "%s" conversion. */
	const char *name = path ? path : "";
	int ret = 0;
	int dfd_dir;
	struct dirent *direntp;
	struct stat st;

	if (is_empty_string(path))
		dfd = dup(fd);
	else
		dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY);
	if (dfd < 0)
		return -1;

	dir = fdopendir(dfd);
	if (!dir)
		return -1;
	/* Transfer ownership to fdopendir(). */
	move_fd(dfd);

	/*
	 * The descriptor now belongs to the directory stream, so dfd must not
	 * be used anymore. Ask the stream for its descriptor instead.
	 */
	dfd_dir = dirfd(dir);
	if (dfd_dir < 0)
		return -1;

	while ((direntp = readdir(dir))) {
		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		ret = fstatat(dfd_dir, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
		if (ret < 0) {
			if (errno != ENOENT)
				break;

			/* Entry vanished in the meantime; st was not filled in. */
			ret = 0;
			continue;
		}

		if (S_ISDIR(st.st_mode))
			ret = print_r(dfd_dir, direntp->d_name);
		else
			INFO("mode(%o):uid(%d):gid(%d) -> %s/%s\n",
			     (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, name,
			     direntp->d_name);
		if (ret < 0 && errno != ENOENT)
			break;
	}

	/*
	 * An empty path cannot be resolved by fstatat() without AT_EMPTY_PATH,
	 * so stat the descriptor itself in that case.
	 */
	if (is_empty_string(path))
		ret = fstat(fd, &st);
	else
		ret = fstatat(fd, path, &st, AT_SYMLINK_NOFOLLOW);
	if (ret)
		return -1;

	INFO("mode(%o):uid(%d):gid(%d) -> %s",
	     (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, name);
	return ret;
}
......@@ -138,7 +138,6 @@ __hidden extern bool is_shared_mountpoint(const char *path);
__hidden extern int detect_shared_rootfs(void);
__hidden extern bool detect_ramfs_rootfs(void);
__hidden extern char *on_path(const char *cmd, const char *rootfs);
__hidden extern bool cgns_supported(void);
__hidden extern char *choose_init(const char *rootfs);
__hidden extern bool switch_to_ns(pid_t pid, const char *ns);
__hidden extern char *get_template_path(const char *t);
......@@ -244,5 +243,6 @@ __hidden extern int safe_mount_beneath(const char *beneath, const char *src, con
const char *fstype, unsigned int flags, const void *data);
__hidden extern int safe_mount_beneath_at(int beneat_fd, const char *src, const char *dst,
const char *fstype, unsigned int flags, const void *data);
__hidden __lxc_unused int print_r(int fd, const char *path);
#endif /* __LXC_UTILS_H */
......@@ -37,6 +37,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include "cgroups/cgroup_utils.h"
#include "lxctest.h"
#include "namespace.h"
#include "process_utils.h"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment