cgroups: add cgroup2 device controller support

Add a bpf-based device controller implementation.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
parent a1734ac6
......@@ -366,6 +366,10 @@ AC_CHECK_TYPES([scmp_filter_ctx], [], [], [[#include <seccomp.h>]])
AC_CHECK_DECLS([seccomp_syscall_resolve_name_arch], [], [], [[#include <seccomp.h>]])
CFLAGS="$OLD_CFLAGS"
AC_CHECK_HEADERS([linux/bpf.h], [
AC_CHECK_TYPES([struct bpf_cgroup_dev_ctx], [], [], [[#include <linux/bpf.h>]])
], [], [])
# Configuration examples
AC_ARG_ENABLE([examples],
[AS_HELP_STRING([--enable-examples], [install examples [default=yes]])],
......
......@@ -6,6 +6,7 @@ noinst_HEADERS = attach.h \
caps.h \
cgroups/cgroup.h \
cgroups/cgroup_utils.h \
cgroups/cgroup2_devices.h \
compiler.h \
conf.h \
confile.h \
......@@ -93,6 +94,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \
caps.c caps.h \
cgroups/cgfsng.c \
cgroups/cgroup.c cgroups/cgroup.h \
cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
compiler.h \
commands.c commands.h \
......
......@@ -54,6 +54,7 @@
#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "conf.h"
......@@ -1105,6 +1106,12 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
wrap.hierarchies = ops->hierarchies;
wrap.conf = handler->conf;
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
if (ret < 0)
WARN("Failed to detach bpf program from cgroup");
#endif
if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
"cgroup_rmdir_wrapper");
......@@ -2474,8 +2481,146 @@ out:
return ret;
}
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
/*
 * Parse the major or minor field of a device rule.
 *
 * @rule:   in/out cursor into the rule string; on success it is advanced past
 *          the field that was consumed.
 * @number: receives ~0 for the "*" wildcard, otherwise the parsed number.
 *
 * Returns 0 on success and -1 if the field is neither "*" nor a decimal
 * number that fits into a non-negative int.
 */
static int bpf_device_number_parse(const char **rule, int *number)
{
	const char *cur = *rule;
	char temp[50] = {0};
	unsigned int parsed;
	size_t len;

	if (*cur == '*') {
		*number = ~0;
		*rule = cur + 1;
		return 0;
	}

	/* <ctype.h> functions require a value representable as unsigned char. */
	if (!isdigit((unsigned char)*cur))
		return -1;

	/* Copy the run of digits, always leaving room for the terminator. */
	for (len = 0; len < sizeof(temp) - 1 && isdigit((unsigned char)*cur); len++)
		temp[len] = *cur++;

	if (lxc_safe_uint(temp, &parsed))
		return -1;

	/*
	 * Reject values that do not fit into an int. Otherwise they would wrap
	 * to a negative number and e.g. 4294967295 would be indistinguishable
	 * from the ~0 wildcard.
	 */
	if (parsed > (~0U >> 1))
		return -1;

	*number = (int)parsed;
	*rule = cur;
	return 0;
}
#endif

/*
 * Translate one "devices.allow"/"devices.deny" setting into a rule of the
 * bpf device program stored in @conf->cgroup2_devices, creating and
 * initializing that program on first use.
 *
 * @conf: container configuration that owns the bpf device program.
 * @key:  setting name; "devices.allow" marks the rule as an allow rule, any
 *        other key produces a rule with allow == 0.
 * @val:  rule in the cgroup v1 syntax "<type> <major>:<minor> <access>" where
 *        type is one of a/b/c, major and minor are numbers or "*", and access
 *        is made up of at most three characters out of "rwm". The bare value
 *        "a" only records the blacklist flag on the program and adds no rule.
 *
 * Returns 0 on success and -1 on a malformed rule or when the program cannot
 * be created or extended. Without HAVE_STRUCT_BPF_CGROUP_DEV_CTX this is a
 * no-op that returns 0.
 */
static int bpf_device_cgroup_prepare(struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item {
		char type;
		int major;
		int minor;
		char access[100];
		int allow;
	} device_item = {0};
	int count, ret;
	struct bpf_program *device;

	device = conf->cgroup2_devices;
	if (!device) {
		device = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
		if (!device) {
			ERROR("Failed to create new ebpf device program");
			return -1;
		}

		if (bpf_program_init(device)) {
			ERROR("Failed to initialize bpf program");
			/* Not yet owned by @conf, so release it here. */
			bpf_program_free(device);
			return -1;
		}

		conf->cgroup2_devices = device;
	}

	if (strcmp("devices.allow", key) == 0)
		device_item.allow = 1;

	/* Global rule: only remember which list semantics were requested. */
	if (strcmp(val, "a") == 0) {
		device->blacklist = (device_item.allow == 1);
		return 0;
	}

	switch (*val) {
	case 'a':
	case 'b':
	case 'c':
		device_item.type = *val;
		break;
	default:
		return -1;
	}

	val++;
	if (!isspace((unsigned char)*val))
		return -1;
	val++;

	/* read major */
	if (bpf_device_number_parse(&val, &device_item.major))
		return -1;

	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (bpf_device_number_parse(&val, &device_item.minor))
		return -1;

	if (!isspace((unsigned char)*val))
		return -1;

	/* read up to three access characters, stopping at end of rule */
	for (val++, count = 0; count < 3; count++, val++) {
		if (*val == '\n' || *val == '\0')
			break;

		if (*val != 'r' && *val != 'w' && *val != 'm')
			return -1;

		device_item.access[count] = *val;
	}

	ret = bpf_program_append_device(device, device_item.type, device_item.major,
					device_item.minor, device_item.access,
					device_item.allow);
	if (ret) {
		ERROR("Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d",
		      device_item.type, device_item.major, device_item.minor,
		      device_item.access, device_item.allow);
		return -1;
	}

	TRACE("Added new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d",
	      device_item.type, device_item.major, device_item.minor,
	      device_item.access, device_item.allow);
#endif
	return 0;
}
static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
struct lxc_list *cgroup_settings)
struct lxc_list *cgroup_settings,
struct lxc_conf *conf)
{
struct lxc_list *iterator;
struct hierarchy *h = ops->unified;
......@@ -2486,17 +2631,24 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
if (!h)
return false;
lxc_list_for_each(iterator, cgroup_settings) {
lxc_list_for_each (iterator, cgroup_settings) {
__do_free char *fullpath = NULL;
int ret;
struct lxc_cgroup *cg = iterator->elem;
fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
if (ret < 0) {
SYSERROR("Failed to set \"%s\" to \"%s\"",
cg->subsystem, cg->value);
return false;
if (strncmp("devices", cg->subsystem, 7) == 0) {
ret = bpf_device_cgroup_prepare(conf, cg->subsystem,
cg->value);
} else {
fullpath = must_make_path(h->container_full_path,
cg->subsystem, NULL);
ret = lxc_write_to_file(fullpath, cg->value,
strlen(cg->value), false, 0666);
if (ret < 0) {
SYSERROR("Failed to set \"%s\" to \"%s\"",
cg->subsystem, cg->value);
return false;
}
}
TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
}
......@@ -2505,6 +2657,32 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
return true;
}
/*
 * Finalize the bpf device program collected in @handler->conf and attach it
 * to the container's cgroup in the unified hierarchy.
 *
 * Returns true when the program was attached or when there is nothing to do
 * (no device program was prepared, or bpf device support was not compiled
 * in). Returns false when a program exists but there is no unified hierarchy
 * to attach it to, or when finalizing or attaching fails.
 */
__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
					  struct lxc_handler *handler)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct bpf_program *device = handler->conf->cgroup2_devices;
	struct hierarchy *h = ops->unified;

	/*
	 * Check for "nothing configured" first: this hook is invoked
	 * unconditionally during spawn, so a host without a unified hierarchy
	 * must not fail when no cgroup2 device rules were requested.
	 */
	if (!device)
		return true;

	if (!h)
		return false;

	if (bpf_program_finalize(device))
		return false;

	return bpf_program_cgroup_attach(device, BPF_CGROUP_DEVICE,
					 h->container_full_path,
					 BPF_F_ALLOW_MULTI) == 0;
#else
	return true;
#endif
}
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
struct lxc_conf *conf,
bool do_devices)
......@@ -2512,7 +2690,11 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
return false;
return __cg_unified_setup_limits(ops, &conf->cgroup2);
/* for v2 we will have already set up devices */
if (do_devices)
return true;
return __cg_unified_setup_limits(ops, &conf->cgroup2, conf);
}
static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
......@@ -2893,6 +3075,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
cgfsng_ops->chown = cgfsng_chown;
cgfsng_ops->mount = cgfsng_mount;
cgfsng_ops->nrtasks = cgfsng_nrtasks;
cgfsng_ops->devices_activate = cgfsng_devices_activate;
return move_ptr(cgfsng_ops);
}
......@@ -164,6 +164,8 @@ struct cgroup_ops {
bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
const char *root, int type);
int (*nrtasks)(struct cgroup_ops *ops);
bool (*devices_activate)(struct cgroup_ops *ops,
struct lxc_handler *handler);
};
extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf);
......
/* SPDX-License-Identifier: LGPL-2.1+ */
/* Parts of this taken from systemd's implementation. */
#ifndef __LXC_CGROUP2_DEVICES_H
#define __LXC_CGROUP2_DEVICES_H
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include "conf.h"
#include "config.h"
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
#include <linux/bpf.h>
#include <linux/filter.h>
#endif
#if !HAVE_BPF
/*
 * Provide a syscall number for bpf when the included headers do not define
 * __NR_bpf as a positive value.
 */
#if !(defined __NR_bpf && __NR_bpf > 0)
/* Discard a definition that exists but is not a positive number. */
#if defined __NR_bpf
#undef __NR_bpf
#endif
/* Hard-coded per-architecture syscall numbers. */
#if defined __i386__
#define __NR_bpf 357
#elif defined __x86_64__
#define __NR_bpf 321
#elif defined __aarch64__
#define __NR_bpf 280
#elif defined __arm__
#define __NR_bpf 386
#elif defined __sparc__
#define __NR_bpf 349
#elif defined __s390__
#define __NR_bpf 351
#elif defined __tilegx__
#define __NR_bpf 280
#else
/* Unknown architecture: __NR_bpf stays undefined and the build only warns. */
#warning "__NR_bpf not defined for your architecture"
#endif
#endif
union bpf_attr;
/*
 * Fallback used in place of a libc bpf() wrapper.
 *
 * Issues the raw bpf system call with @cmd, @attr and @size and returns its
 * result truncated to int. If no syscall number is known for the target
 * architecture, fails with -1 and errno set to ENOSYS.
 */
static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
{
#ifndef __NR_bpf
	errno = ENOSYS;
	return -1;
#else
	return (int)syscall(__NR_bpf, cmd, attr, size);
#endif
}
#define bpf missing_bpf
#endif
/*
 * State of one bpf program managed by the bpf_program_*() helpers.
 * NOTE(review): the field descriptions below are inferred from names and from
 * the callers of this API; confirm against the implementation file.
 */
struct bpf_program {
/* Set when a bare "a" rule is given for "devices.allow", cleared for other keys. */
bool blacklist;
/* Presumably the file descriptor of the program once loaded - TODO confirm. */
int kernel_fd;
/* Presumably the program type passed to bpf_program_new() - TODO confirm. */
uint32_t prog_type;
/* Presumably the number of entries in the instruction buffer - TODO confirm. */
size_t n_instructions;
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
/* Instruction buffer; only present when the kernel headers provide bpf types. */
struct bpf_insn *instructions;
#endif
/* Presumably records where and how the program was attached - TODO confirm. */
char *attached_path;
int attached_type;
uint32_t attached_flags;
};
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
/*
 * bpf program API, available when HAVE_STRUCT_BPF_CGROUP_DEV_CTX is defined.
 * NOTE(review): return conventions below are taken from how callers check
 * the results; confirm against the implementation.
 */

/* Create a program of type @prog_type; callers treat NULL as failure. */
struct bpf_program *bpf_program_new(uint32_t prog_type);
/* Initialize a freshly created program; callers treat non-zero as failure. */
int bpf_program_init(struct bpf_program *prog);
/*
 * Add a rule for device @type with @major/@minor and @access to @prog;
 * @allow selects allow (1) versus deny (0). Callers treat non-zero as failure.
 */
int bpf_program_append_device(struct bpf_program *prog, char type, int major,
int minor, const char *access, int allow);
/* Finish the program before attaching; callers treat non-zero as failure. */
int bpf_program_finalize(struct bpf_program *prog);
/* Attach @prog to the cgroup at @path; callers treat 0 as success. */
int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
const char *path, uint32_t flags);
/* Detach @prog from its cgroup; callers treat a negative value as failure. */
int bpf_program_cgroup_detach(struct bpf_program *prog);
/* Release @prog. */
void bpf_program_free(struct bpf_program *prog);
/* Clear the cgroup2 device state of @conf; called when the config is freed. */
void lxc_clear_cgroup2_devices(struct lxc_conf *conf);
/*
 * Cleanup helper taking the address of a program pointer: frees the program
 * it points to, if any, and resets the pointer to NULL.
 */
static inline void __do_bpf_program_free(struct bpf_program **prog)
{
	struct bpf_program *cur = *prog;

	if (!cur)
		return;

	bpf_program_free(cur);
	*prog = NULL;
}
#else
/*
 * Fallback stubs used when HAVE_STRUCT_BPF_CGROUP_DEV_CTX is not defined.
 * They keep callers compiling without bpf support: creation yields NULL,
 * every operation reports -ENOSYS and the release helpers do nothing.
 */

/* Always fails: no program can be created without bpf support. */
static inline struct bpf_program *bpf_program_new(uint32_t prog_type)
{
	return NULL;
}

/* Always returns -ENOSYS. */
static inline int bpf_program_init(struct bpf_program *prog)
{
	return -ENOSYS;
}

/* Always returns -ENOSYS; no rule is recorded. */
static inline int bpf_program_append_device(struct bpf_program *prog, char type,
					    int major, int minor,
					    const char *access, int allow)
{
	return -ENOSYS;
}

/* Always returns -ENOSYS. */
static inline int bpf_program_finalize(struct bpf_program *prog)
{
	return -ENOSYS;
}

/* Always returns -ENOSYS; nothing is attached. */
static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
					    const char *path, uint32_t flags)
{
	return -ENOSYS;
}

/* Always returns -ENOSYS; nothing is detached. */
static inline int bpf_program_cgroup_detach(struct bpf_program *prog)
{
	return -ENOSYS;
}

/* No-op. */
static inline void bpf_program_free(struct bpf_program *prog)
{
}

/* No-op. */
static inline void lxc_clear_cgroup2_devices(struct lxc_conf *conf)
{
}

/* No-op; in particular *prog is left untouched. */
static inline void __do_bpf_program_free(struct bpf_program **prog)
{
}
#endif
#endif /* __LXC_CGROUP2_DEVICES_H */
......@@ -57,6 +57,7 @@
#include "af_unix.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "conf.h"
#include "config.h"
#include "confile.h"
......@@ -4044,6 +4045,7 @@ void lxc_conf_free(struct lxc_conf *conf)
lxc_clear_config_keepcaps(conf);
lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
lxc_clear_cgroup2_devices(conf);
lxc_clear_hooks(conf, "lxc.hook");
lxc_clear_mount_entries(conf);
lxc_clear_idmaps(conf);
......
......@@ -234,6 +234,7 @@ struct lxc_conf {
struct {
struct lxc_list cgroup;
struct lxc_list cgroup2;
struct bpf_program *cgroup2_devices;
};
struct {
......
......@@ -505,11 +505,11 @@ ATTR_UNUSED static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \
} while (0)
#endif
#define error_log_errno(__errno__, format, ...) \
({ \
errno = __errno__; \
SYSERROR(format, ##__VA_ARGS__); \
-1; \
#define error_log_errno(__errno__, format, ...) \
({ \
errno = __errno__; \
SYSERROR(format, ##__VA_ARGS__); \
-1; \
})
extern int lxc_log_fd;
......
......@@ -389,6 +389,8 @@ enum {
#define PTR_TO_INTMAX(p) ((intmax_t)((intptr_t)(p)))
#define INTMAX_TO_PTR(u) ((void *)((intptr_t)(u)))
/*
 * Convert a pointer to a 64-bit integer, e.g. for pointer fields that are
 * passed to the kernel as 64-bit values. The conversion goes through
 * uintptr_t so that on 32-bit targets the address is zero-extended; going
 * through a signed type would sign-extend addresses with the top bit set.
 */
#define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p)))
#define LXC_INVALID_UID ((uid_t)-1)
#define LXC_INVALID_GID ((gid_t)-1)
......@@ -425,4 +427,80 @@ enum {
#define LXC_TIMESTAMP_FNAME "ts"
#define LXC_COMMENT_FNAME "comment"
/* Taken from systemd. */
/*
 * Free (a), move the pointer held in (b) into (a) and reset (b) to NULL.
 * The whole expression evaluates to 0. Both arguments are evaluated more
 * than once, so they must not have side effects.
 */
#define free_and_replace(a, b) \
({ \
free(a); \
(a) = (b); \
(b) = NULL; \
0; \
})
/* Two-level token pasting so that macro arguments are expanded first. */
#define XCONCATENATE(x, y) x##y
#define CONCATENATE(x, y) XCONCATENATE(x, y)

/* Build an identifier from @x that is made unique by the number @uniq. */
#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq))
#define UNIQ __COUNTER__

/*
 * MIN(a, b) - smaller of two values; each argument is evaluated exactly once.
 *
 * The temporaries get per-expansion unique names so that nested uses do not
 * shadow each other. __typeof__ is used instead of typeof so the macro also
 * compiles when the including file is built in a strict ISO C mode.
 */
#undef MIN
#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b))
#define __MIN(aq, a, bq, b)                                                    \
	({                                                                     \
		const __typeof__(a) UNIQ_T(A, aq) = (a);                       \
		const __typeof__(b) UNIQ_T(B, bq) = (b);                       \
		UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \
	})
/* Taken from the kernel. */
/*
* min()/max()/clamp() macros must accomplish three things:
*
* - avoid multiple evaluations of the arguments (so side-effects like
* "x++" happen only once) when non-constant.
* - perform strict type-checking (to generate warnings instead of
* nasty runtime surprises). See the "unnecessary" pointer comparison
* in __typecheck().
* - retain result as a constant expressions when called with only
* constant expressions (to avoid tripping VLA warnings in stack
* allocation usage).
*/
/*
 * Always evaluates to 1; the pointer comparison inside sizeof is never
 * executed and exists only so the compiler can diagnose mismatched types.
 */
#define __typecheck(x, y) (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
/*
 * This returns a constant expression while determining if an argument is
 * a constant expression, most importantly without evaluating the argument.
 * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
 */
#define __is_constexpr(x) \
(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x)*0l)) : (int *)8)))
/* True when both arguments are constant expressions. */
#define __no_side_effects(x, y) (__is_constexpr(x) && __is_constexpr(y))
/* True when the plain, multiply-evaluating comparison may be used. */
#define __safe_cmp(x, y) (__typecheck(x, y) && __no_side_effects(x, y))
/* Plain comparison; evaluates the selected argument twice. */
#define __cmp(x, y, op) ((x)op(y) ? (x) : (y))
/* Comparison through temporaries so each argument is evaluated once. */
#define __cmp_once(x, y, unique_x, unique_y, op) \
({ \
typeof(x) unique_x = (x); \
typeof(y) unique_y = (y); \
__cmp(unique_x, unique_y, op); \
})
/*
 * Select the plain comparison for constant arguments and the evaluate-once
 * form otherwise.
 * NOTE(review): __UNIQUE_ID is not defined in the visible part of this file;
 * confirm that a definition is in scope wherever min() is used.
 */
#define __careful_cmp(x, y, op) \
__builtin_choose_expr(__safe_cmp(x, y), __cmp(x, y, op), \
__cmp_once(x, y, __UNIQUE_ID(__x), \
__UNIQUE_ID(__y), op))
/**
 * min - return minimum of two values of the same or compatible types
 * @x: first value
 * @y: second value
 */
#define min(x, y) __careful_cmp(x, y, <)
/*
 * ARRAY_SIZE(x) - number of elements in the array @x.
 *
 * For anything that is not a real array (e.g. a pointer) the expression has
 * type void, so using the result fails to compile instead of silently
 * yielding a wrong count. __typeof__ is used instead of typeof so the macro
 * also compiles when the including file is built in a strict ISO C mode.
 */
#define ARRAY_SIZE(x)                                                        \
	(__builtin_choose_expr(!__builtin_types_compatible_p(__typeof__(x),  \
							     __typeof__(&*(x))), \
			       sizeof(x) / sizeof((x)[0]), ((void)0)))
#endif /* __LXC_MACRO_H */
......@@ -1912,6 +1912,12 @@ static int lxc_spawn(struct lxc_handler *handler)
}
TRACE("Set up legacy device cgroup controller limits");
if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
ERROR("Failed to setup cgroup2 device controller limits");
goto out_delete_net;
}
TRACE("Set up cgroup2 device controller limits");
if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */
ret = lxc_try_preserve_ns(handler->pid, "cgroup");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment