Unverified Commit 35a68d6d by Stéphane Graber Committed by GitHub

Merge pull request #3681 from brauner/2021-02-18/cgroups

cgroups: fixes & bpf rework
parents 599a0c6c ad755295
......@@ -385,10 +385,6 @@ AM_COND_IF([ENABLE_CAP],
AC_CHECK_LIB(cap,cap_get_file, AC_DEFINE(LIBCAP_SUPPORTS_FILE_CAPABILITIES,1,[Have cap_get_file]),[],[])
AC_SUBST([CAP_LIBS], [-lcap])])
AC_CHECK_HEADERS([linux/bpf.h], [
AC_CHECK_TYPES([struct bpf_cgroup_dev_ctx], [], [], [[#include <linux/bpf.h>]])
], [], [])
# Configuration examples
AC_ARG_ENABLE([examples],
[AS_HELP_STRING([--enable-examples], [install examples [default=yes]])],
......
This source diff could not be displayed because it is too large. You can view the blob instead.
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI__LINUX_BPF_COMMON_H__
#define _UAPI__LINUX_BPF_COMMON_H__

/*
 * Opcode field layout shared by the BPF instruction encodings.
 *
 * Each FIELD(code) accessor below masks one bit-field out of an opcode; the
 * constants listed under it are the values that field can take.
 */

/* Instruction classes: low three bits of the opcode (mask 0x07). */
#define BPF_CLASS(code)	((code) & 0x07)
#define BPF_LD		0x00
#define BPF_LDX		0x01
#define BPF_ST		0x02
#define BPF_STX		0x03
#define BPF_ALU		0x04
#define BPF_JMP		0x05
#define BPF_RET		0x06
#define BPF_MISC	0x07

/* ld/ldx fields: operand size (mask 0x18). */
#define BPF_SIZE(code)	((code) & 0x18)
#define BPF_W		0x00 /* 32-bit */
#define BPF_H		0x08 /* 16-bit */
#define BPF_B		0x10 /*  8-bit */
/* eBPF BPF_DW 0x18 64-bit */

/* ld/ldx fields: addressing mode (mask 0xe0). */
#define BPF_MODE(code)	((code) & 0xe0)
#define BPF_IMM		0x00
#define BPF_ABS		0x20
#define BPF_IND		0x40
#define BPF_MEM		0x60
#define BPF_LEN		0x80
#define BPF_MSH		0xa0

/* alu/jmp fields: operation (mask 0xf0). */
#define BPF_OP(code)	((code) & 0xf0)
/* ... values used with the BPF_ALU class */
#define BPF_ADD		0x00
#define BPF_SUB		0x10
#define BPF_MUL		0x20
#define BPF_DIV		0x30
#define BPF_OR		0x40
#define BPF_AND		0x50
#define BPF_LSH		0x60
#define BPF_RSH		0x70
#define BPF_NEG		0x80
#define BPF_MOD		0x90
#define BPF_XOR		0xa0
/* ... values used with the BPF_JMP class (they share the BPF_OP() field) */
#define BPF_JA		0x00
#define BPF_JEQ		0x10
#define BPF_JGT		0x20
#define BPF_JGE		0x30
#define BPF_JSET	0x40

/* alu/jmp fields: operand source selector (mask 0x08). */
#define BPF_SRC(code)	((code) & 0x08)
#define BPF_K		0x00
#define BPF_X		0x08

/* Default program length limit; only defined when the includer has not set one. */
#ifndef BPF_MAXINSNS
#define BPF_MAXINSNS	4096
#endif

#endif /* _UAPI__LINUX_BPF_COMMON_H__ */
......@@ -6,6 +6,8 @@ pkginclude_HEADERS = attach_options.h \
noinst_HEADERS = api_extensions.h \
attach.h \
../include/bpf.h \
../include/bpf_common.h \
caps.h \
cgroups/cgroup.h \
cgroups/cgroup_utils.h \
......@@ -99,6 +101,8 @@ lib_LTLIBRARIES = liblxc.la
liblxc_la_SOURCES = af_unix.c af_unix.h \
api_extensions.h \
attach.c attach.h \
../include/bpf.h \
../include/bpf_common.h \
caps.c caps.h \
cgroups/cgfsng.c \
cgroups/cgroup.c cgroups/cgroup.h \
......
......@@ -34,9 +34,7 @@ static char *api_extensions[] = {
"network_gateway_device_route",
"network_phys_macvlan_mtu",
"network_veth_router",
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
"cgroup2_devices",
#endif
"cgroup2",
"pidfd",
"cgroup_advanced_isolation",
......
......@@ -14,6 +14,7 @@
#include "conf.h"
#include "config.h"
#include "initutils.h"
#include "memory_utils.h"
#include "log.h"
#include "start.h"
#include "string_utils.h"
......@@ -68,14 +69,9 @@ void cgroup_exit(struct cgroup_ops *ops)
free(ops->cgroup_pattern);
free(ops->monitor_cgroup);
{
if (ops->container_cgroup != ops->container_limit_cgroup)
free(ops->container_limit_cgroup);
free(ops->container_cgroup);
}
free_equal(ops->container_cgroup, ops->container_limit_cgroup);
if (ops->cgroup2_devices)
bpf_program_free(ops->cgroup2_devices);
bpf_device_program_free(ops);
if (ops->dfd_mnt_cgroupfs_host >= 0)
close(ops->dfd_mnt_cgroupfs_host);
......@@ -92,32 +88,15 @@ void cgroup_exit(struct cgroup_ops *ops)
free((*it)->mountpoint);
free((*it)->container_base_path);
{
free((*it)->container_full_path);
if ((*it)->container_full_path != (*it)->container_limit_path)
free((*it)->monitor_full_path);
}
{
if ((*it)->cgfd_limit >= 0 && (*it)->cgfd_con != (*it)->cgfd_limit)
close((*it)->cgfd_limit);
free_equal((*it)->container_full_path,
(*it)->container_limit_path);
if ((*it)->cgfd_con >= 0)
close((*it)->cgfd_con);
}
close_equal((*it)->cgfd_con, (*it)->cgfd_limit);
if ((*it)->cgfd_mon >= 0)
close((*it)->cgfd_mon);
{
if ((*it)->dfd_base >= 0 && (*it)->dfd_mnt != (*it)->dfd_base)
close((*it)->dfd_base);
if ((*it)->dfd_mnt >= 0)
close((*it)->dfd_mnt);
}
close_equal((*it)->dfd_base, (*it)->dfd_mnt);
free(*it);
}
......
......@@ -62,9 +62,6 @@ typedef enum {
* - The full path to the container's limiting cgroup. May simply point to
* container_full_path.
*
* @monitor_full_path
* - The full path to the monitor's cgroup.
*
* @version
* - legacy hierarchy
* If the hierarchy is a legacy hierarchy this will be set to
......@@ -84,7 +81,6 @@ struct hierarchy {
char *container_base_path;
char *container_full_path;
char *container_limit_path;
char *monitor_full_path;
int version;
/* cgroup2 only */
......@@ -101,7 +97,7 @@ struct hierarchy {
*/
int cgfd_limit;
/* File descriptor for the monitor's cgroup @monitor_full_path. */
/* File descriptor for the monitor's cgroup. */
int cgfd_mon;
/* File descriptor for the controller's mountpoint @mountpoint. */
......
......@@ -19,117 +19,94 @@
#include "compiler.h"
#include "conf.h"
#include "config.h"
#include "list.h"
#include "macro.h"
#include "memory_utils.h"
#include "syscall_numbers.h"
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
#include <linux/bpf.h>
#include <linux/filter.h>
#endif
#include "include/bpf.h"
#include "include/bpf_common.h"
#ifndef HAVE_BPF
union bpf_attr;
static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
static inline int bpf_lxc(int cmd, union bpf_attr *attr, size_t size)
{
return syscall(__NR_bpf, cmd, attr, size);
}
#define bpf missing_bpf
#define bpf bpf_lxc
#endif /* HAVE_BPF */
struct bpf_program {
int device_list_type;
int kernel_fd;
uint32_t prog_type;
__u32 prog_type;
size_t n_instructions;
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
struct bpf_insn *instructions;
#endif /* HAVE_STRUCT_BPF_CGROUP_DEV_CTX */
char *attached_path;
int fd_cgroup;
int attached_type;
uint32_t attached_flags;
__u32 attached_flags;
};
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
__hidden extern struct bpf_program *bpf_program_new(uint32_t prog_type);
__hidden extern int bpf_program_init(struct bpf_program *prog);
__hidden extern int bpf_program_append_device(struct bpf_program *prog, struct device_item *device);
__hidden extern int bpf_program_finalize(struct bpf_program *prog);
__hidden extern int bpf_program_cgroup_attach(struct bpf_program *prog, int type, const char *path,
uint32_t flags);
__hidden extern int bpf_program_cgroup_detach(struct bpf_program *prog);
__hidden extern void bpf_program_free(struct bpf_program *prog);
__hidden extern void bpf_device_program_free(struct cgroup_ops *ops);
__hidden extern bool bpf_devices_cgroup_supported(void);
__hidden extern int bpf_list_add_device(struct lxc_conf *conf, struct device_item *device);
#else /* !HAVE_STRUCT_BPF_CGROUP_DEV_CTX */
static inline struct bpf_program *bpf_program_new(uint32_t prog_type)
static inline bool bpf_device_block_all(const struct bpf_program *prog)
{
errno = ENOSYS;
return NULL;
/* LXC_BPF_DEVICE_CGROUP_ALLOWLIST -> allowlist (deny all) */
return prog->device_list_type == LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
}
static inline int bpf_program_init(struct bpf_program *prog)
static inline bool bpf_device_add(const struct bpf_program *prog,
struct device_item *device)
{
errno = ENOSYS;
return -1;
}
if (device->global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE)
return false;
static inline int bpf_program_append_device(struct bpf_program *prog, char type,
int major, int minor,
const char *access, int allow)
{
errno = ENOSYS;
return -1;
}
/* We're blocking all devices so skip individual deny rules. */
if (bpf_device_block_all(prog) && !device->allow)
return false;
static inline int bpf_program_finalize(struct bpf_program *prog)
{
errno = ENOSYS;
return -1;
/* We're allowing all devices so skip individual allow rules. */
if (!bpf_device_block_all(prog) && device->allow)
return false;
return true;
}
static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
const char *path, uint32_t flags)
static inline void bpf_device_set_type(struct bpf_program *prog,
struct lxc_list *devices)
{
errno = ENOSYS;
return -1;
}
struct lxc_list *it;
static inline int bpf_program_cgroup_detach(struct bpf_program *prog)
{
errno = ENOSYS;
return -1;
}
lxc_list_for_each (it, devices) {
struct device_item *cur = it->elem;
static inline void bpf_program_free(struct bpf_program *prog)
{
if (cur->global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE)
prog->device_list_type = cur->global_rule;
}
}
static inline void bpf_device_program_free(struct cgroup_ops *ops)
{
}
__hidden extern struct bpf_program *bpf_program_new(__u32 prog_type);
__hidden extern int bpf_program_init(struct bpf_program *prog);
__hidden extern int bpf_program_append_device(struct bpf_program *prog, struct device_item *device);
__hidden extern int bpf_program_finalize(struct bpf_program *prog);
__hidden extern int bpf_program_cgroup_detach(struct bpf_program *prog);
__hidden extern void bpf_device_program_free(struct cgroup_ops *ops);
__hidden extern bool bpf_devices_cgroup_supported(void);
static inline bool bpf_devices_cgroup_supported(void)
{
return false;
}
__hidden extern int bpf_list_add_device(struct lxc_list *devices,
struct device_item *device);
__hidden extern bool bpf_cgroup_devices_attach(struct cgroup_ops *ops,
struct lxc_list *devices);
__hidden extern bool bpf_cgroup_devices_update(struct cgroup_ops *ops,
struct device_item *new,
struct lxc_list *devices);
static inline int bpf_list_add_device(struct lxc_conf *conf,
struct device_item *device)
static inline void bpf_program_free(struct bpf_program *prog)
{
errno = ENOSYS;
return -1;
if (prog) {
(void)bpf_program_cgroup_detach(prog);
free(prog->instructions);
free(prog);
}
}
#endif /* !HAVE_STRUCT_BPF_CGROUP_DEV_CTX */
define_cleanup_function(struct bpf_program *, bpf_program_free);
#define __do_bpf_program_free call_cleaner(bpf_program_free)
......
......@@ -1168,7 +1168,6 @@ static int lxc_cmd_add_state_client_callback(__owns int fd, struct lxc_cmd_req *
int lxc_cmd_add_bpf_device_cgroup(const char *name, const char *lxcpath,
struct device_item *device)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
int stopped = 0;
struct lxc_cmd_rr cmd = {
.req = {
......@@ -1188,25 +1187,16 @@ int lxc_cmd_add_bpf_device_cgroup(const char *name, const char *lxcpath,
return log_error_errno(-1, errno, "Failed to add new bpf device cgroup rule");
return 0;
#else
return ret_set_errno(-1, ENOSYS);
#endif
}
static int lxc_cmd_add_bpf_device_cgroup_callback(int fd, struct lxc_cmd_req *req,
struct lxc_handler *handler,
struct lxc_epoll_descr *descr)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
__do_bpf_program_free struct bpf_program *devices = NULL;
struct lxc_cmd_rsp rsp = {0};
struct lxc_conf *conf = handler->conf;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
struct hierarchy *unified = cgroup_ops->unified;
int ret;
struct lxc_list *it;
struct lxc_cmd_rsp rsp = {};
struct device_item *device;
struct bpf_program *devices_old;
struct lxc_conf *conf;
if (req->datalen <= 0)
return LXC_CMD_REAP_CLIENT_FD;
......@@ -1216,58 +1206,19 @@ static int lxc_cmd_add_bpf_device_cgroup_callback(int fd, struct lxc_cmd_req *re
if (!req->data)
return LXC_CMD_REAP_CLIENT_FD;
device = (struct device_item *)req->data;
rsp.ret = -1;
if (!unified)
goto respond;
ret = bpf_list_add_device(conf, device);
if (ret < 0)
goto respond;
devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
if (!devices)
goto respond;
ret = bpf_program_init(devices);
if (ret)
goto respond;
lxc_list_for_each(it, &conf->devices) {
struct device_item *cur = it->elem;
ret = bpf_program_append_device(devices, cur);
if (ret)
goto respond;
}
ret = bpf_program_finalize(devices);
if (ret)
goto respond;
ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
unified->container_full_path,
BPF_F_ALLOW_MULTI);
if (ret)
goto respond;
/* Replace old bpf program. */
devices_old = move_ptr(cgroup_ops->cgroup2_devices);
cgroup_ops->cgroup2_devices = move_ptr(devices);
devices = move_ptr(devices_old);
rsp.ret = 0;
device = (struct device_item *)req->data;
conf = handler->conf;
if (!bpf_cgroup_devices_update(handler->cgroup_ops, device, &conf->devices))
rsp.ret = -1;
else
rsp.ret = 0;
respond:
ret = lxc_cmd_rsp_send(fd, &rsp);
if (ret < 0)
return LXC_CMD_REAP_CLIENT_FD;
return 0;
#else
return ret_set_errno(-1, ENOSYS);
#endif
}
int lxc_cmd_console_log(const char *name, const char *lxcpath,
......
......@@ -19,7 +19,9 @@
#endif
#endif
#ifndef __fallthrough
#if __GNUC__ >= 7
#define __fallthrough __attribute__((__fallthrough__))
#else
#define __fallthrough /* fall through */
#endif
......
......@@ -269,11 +269,11 @@ struct lxc_state_client {
lxc_state_t states[MAX_STATE];
};
enum {
LXC_BPF_DEVICE_CGROUP_LOCAL_RULE = -1,
LXC_BPF_DEVICE_CGROUP_ALLOWLIST = 0,
LXC_BPF_DEVICE_CGROUP_DENYLIST = 1,
};
/*
 * Classification of a device cgroup rule.
 *
 * A rule whose value is greater than LXC_BPF_DEVICE_CGROUP_LOCAL_RULE is a
 * global rule that selects the overall list type; a local rule only describes
 * an individual device entry.
 */
enum lxc_bpf_devices_rule_t {
	/* Per-device rule; does not change the list type. */
	LXC_BPF_DEVICE_CGROUP_LOCAL_RULE = -1,
	/* Global rule: allowlist, i.e. every device is denied by default. */
	LXC_BPF_DEVICE_CGROUP_ALLOWLIST = 0,
	/* Global rule: denylist (presumably allow-by-default; confirm against the bpf helpers). */
	LXC_BPF_DEVICE_CGROUP_DENYLIST = 1,
};

typedef enum lxc_bpf_devices_rule_t lxc_bpf_devices_rule_t;
struct device_item {
char type;
......
......@@ -2040,8 +2040,8 @@ static bool do_lxcapi_reboot2(struct lxc_container *c, int timeout)
else
killret = kill(pid, rebootsignal);
if (killret < 0)
return log_warn(false, "Failed to send signal %d to pid %d", rebootsignal, pid);
TRACE("Sent signal %d to pid %d", rebootsignal, pid);
return log_warn(false, "Failed to send signal %d to pidfd(%d)/pid(%d)", rebootsignal, pidfd, pid);
TRACE("Sent signal %d to pidfd(%d)/pid(%d)", rebootsignal, pidfd, pid);
if (timeout == 0)
return true;
......
......@@ -694,4 +694,11 @@ enum {
#define MAX_FILENO ~0U
/*
 * swap() - exchange the values of two lvalues of the same type.
 *
 * @a, @b: modifiable lvalues; each is evaluated more than once, so neither may
 *         carry side effects.
 *
 * __typeof__ is used rather than the plain typeof keyword so that the macro
 * also expands correctly when the including file is built in a strict ISO
 * mode (-std=c99 / -std=c11), where the unprefixed keyword is not recognised.
 * The temporary deliberately avoids a leading double underscore, which is
 * reserved for the implementation.
 */
#define swap(a, b) \
	do { \
		__typeof__(a) swap_tmp_ = (a); \
		(a) = (b); \
		(b) = swap_tmp_; \
	} while (0)
#endif /* __LXC_MACRO_H */
......@@ -95,4 +95,21 @@ static inline void *memdup(const void *data, size_t len)
(a) = move_fd((b)); \
})
/*
 * close_equal() - close two file descriptor variables that may alias.
 *
 * @a, @b: modifiable int lvalues holding file descriptors (or a negative
 *         value when unset). Each is evaluated more than once.
 *
 * @a is closed only when it is valid and differs from @b, so a descriptor
 * stored in both variables is closed exactly once. @b is closed only when it
 * is valid; the guard previously compared the close() function itself against
 * zero, which is always true and caused close() to be called on negative
 * descriptors. Afterwards both variables are reset to -EBADF.
 */
#define close_equal(a, b) \
	({ \
		if ((a) >= 0 && (a) != (b)) \
			close(a); \
		if ((b) >= 0) \
			close(b); \
		(a) = (b) = -EBADF; \
	})
/*
 * free_equal() - release two pointer variables that may refer to the same
 * allocation.
 *
 * When both hold the same pointer it is freed once; otherwise each is freed
 * separately. Both variables are set to NULL afterwards. The arguments must
 * be modifiable lvalues and are evaluated more than once.
 */
#define free_equal(a, b) \
	({ \
		if ((a) == (b)) { \
			free(b); \
		} else { \
			free(a); \
			free(b); \
		} \
		(a) = (b) = NULL; \
	})
#endif /* __LXC_MEMORY_UTILS_H */
......@@ -1858,19 +1858,34 @@ bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res)
int print_r(int fd, const char *path)
{
__do_close int dfd = -EBADF;
__do_close int dfd = -EBADF, dfd_dup = -EBADF;
__do_closedir DIR *dir = NULL;
int ret = 0;
struct dirent *direntp;
struct stat st;
if (is_empty_string(path))
dfd = dup(fd);
else
dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY);
if (is_empty_string(path)) {
char buf[LXC_PROC_SELF_FD_LEN];
ret = strnprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
if (ret < 0)
return ret_errno(EIO);
/*
* O_PATH file descriptors can't be used so we need to re-open
* just in case.
*/
dfd = openat(-EBADF, buf, O_CLOEXEC | O_DIRECTORY, 0);
} else {
dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY, 0);
}
if (dfd < 0)
return -1;
dfd_dup = dup_cloexec(dfd);
if (dfd_dup < 0)
return -1;
dir = fdopendir(dfd);
if (!dir)
return -1;
......@@ -1882,26 +1897,29 @@ int print_r(int fd, const char *path)
!strcmp(direntp->d_name, ".."))
continue;
ret = fstatat(dfd, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
ret = fstatat(dfd_dup, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
if (ret < 0 && errno != ENOENT)
break;
ret = 0;
if (S_ISDIR(st.st_mode))
ret = print_r(dfd, direntp->d_name);
ret = print_r(dfd_dup, direntp->d_name);
else
INFO("mode(%o):uid(%d):gid(%d) -> %s/%s\n",
(st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, path,
INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n",
(st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, dfd_dup,
direntp->d_name);
if (ret < 0 && errno != ENOENT)
break;
}
ret = fstatat(fd, path, &st, AT_SYMLINK_NOFOLLOW);
if (is_empty_string(path))
ret = fstatat(fd, "", &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH);
else
ret = fstatat(fd, path, &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW);
if (ret)
return -1;
else
INFO("mode(%o):uid(%d):gid(%d) -> %s",
(st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, path);
(st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, maybe_empty(path));
return ret;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment