Unverified Commit b6f48637 by Stéphane Graber Committed by GitHub

Merge pull request #3689 from brauner/2021-02-21/fixes

cgroups: introduce fd-only cgroup attach via LXC_CMD_GET_CGROUP_CTX
parents 79399658 3a6678c7
...@@ -112,22 +112,18 @@ int lxc_abstract_unix_connect(const char *path) ...@@ -112,22 +112,18 @@ int lxc_abstract_unix_connect(const char *path)
return move_fd(fd); return move_fd(fd);
} }
int lxc_abstract_unix_send_fds_iov(int fd, int *sendfds, int num_sendfds, int lxc_abstract_unix_send_fds_iov(int fd, const int *sendfds, int num_sendfds,
struct iovec *iov, size_t iovlen) struct iovec *iov, size_t iovlen)
{ {
__do_free char *cmsgbuf = NULL; __do_free char *cmsgbuf = NULL;
int ret; int ret;
struct msghdr msg; struct msghdr msg = {};
struct cmsghdr *cmsg = NULL; struct cmsghdr *cmsg = NULL;
size_t cmsgbufsize = CMSG_SPACE(num_sendfds * sizeof(int)); size_t cmsgbufsize = CMSG_SPACE(num_sendfds * sizeof(int));
memset(&msg, 0, sizeof(msg));
cmsgbuf = malloc(cmsgbufsize); cmsgbuf = malloc(cmsgbufsize);
if (!cmsgbuf) { if (!cmsgbuf)
errno = ENOMEM; return ret_errno(-ENOMEM);
return -1;
}
msg.msg_control = cmsgbuf; msg.msg_control = cmsgbuf;
msg.msg_controllen = cmsgbufsize; msg.msg_controllen = cmsgbufsize;
...@@ -151,10 +147,10 @@ int lxc_abstract_unix_send_fds_iov(int fd, int *sendfds, int num_sendfds, ...@@ -151,10 +147,10 @@ int lxc_abstract_unix_send_fds_iov(int fd, int *sendfds, int num_sendfds,
return ret; return ret;
} }
int lxc_abstract_unix_send_fds(int fd, int *sendfds, int num_sendfds, int lxc_abstract_unix_send_fds(int fd, const int *sendfds, int num_sendfds,
void *data, size_t size) void *data, size_t size)
{ {
char buf[1] = {0}; char buf[1] = {};
struct iovec iov = { struct iovec iov = {
.iov_base = data ? data : buf, .iov_base = data ? data : buf,
.iov_len = data ? size : sizeof(buf), .iov_len = data ? size : sizeof(buf),
...@@ -168,60 +164,174 @@ int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, ...@@ -168,60 +164,174 @@ int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data,
return lxc_abstract_unix_send_fds(fd, sendfds, num_sendfds, data, size); return lxc_abstract_unix_send_fds(fd, sendfds, num_sendfds, data, size);
} }
static int lxc_abstract_unix_recv_fds_iov(int fd, int *recvfds, int num_recvfds, static ssize_t lxc_abstract_unix_recv_fds_iov(int fd,
struct iovec *iov, size_t iovlen) struct unix_fds *ret_fds,
struct iovec *ret_iov,
size_t size_ret_iov)
{ {
__do_free char *cmsgbuf = NULL; __do_free char *cmsgbuf = NULL;
int ret; ssize_t ret;
struct msghdr msg; struct msghdr msg = {};
struct cmsghdr *cmsg = NULL;
size_t cmsgbufsize = CMSG_SPACE(sizeof(struct ucred)) + size_t cmsgbufsize = CMSG_SPACE(sizeof(struct ucred)) +
CMSG_SPACE(num_recvfds * sizeof(int)); CMSG_SPACE(ret_fds->fd_count_max * sizeof(int));
memset(&msg, 0, sizeof(msg));
cmsgbuf = malloc(cmsgbufsize); cmsgbuf = zalloc(cmsgbufsize);
if (!cmsgbuf) if (!cmsgbuf)
return ret_errno(ENOMEM); return ret_errno(ENOMEM);
msg.msg_control = cmsgbuf; msg.msg_control = cmsgbuf;
msg.msg_controllen = cmsgbufsize; msg.msg_controllen = cmsgbufsize;
msg.msg_iov = iov; msg.msg_iov = ret_iov;
msg.msg_iovlen = iovlen; msg.msg_iovlen = size_ret_iov;
do { again:
ret = recvmsg(fd, &msg, MSG_CMSG_CLOEXEC); ret = recvmsg(fd, &msg, MSG_CMSG_CLOEXEC);
} while (ret < 0 && errno == EINTR); if (ret < 0) {
if (ret < 0 || ret == 0) if (errno == EINTR)
return ret; goto again;
return syserrno(-errno, "Failed to receive response");
}
if (ret == 0)
return 0;
/* If SO_PASSCRED is set we will always get a ucred message. */
for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
__u32 idx;
/* /*
* If SO_PASSCRED is set we will always get a ucred message. * This causes some compilers to complain about
* increased alignment requirements but I haven't found
* a better way to deal with this yet. Suggestions
* welcome!
*/ */
for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { #pragma GCC diagnostic push
if (cmsg->cmsg_type != SCM_RIGHTS) #pragma GCC diagnostic ignored "-Wcast-align"
continue; int *fds_raw = (int *)CMSG_DATA(cmsg);
#pragma GCC diagnostic pop
memset(recvfds, -1, num_recvfds * sizeof(int)); __u32 num_raw = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
if (cmsg &&
cmsg->cmsg_len == CMSG_LEN(num_recvfds * sizeof(int)) && /*
cmsg->cmsg_level == SOL_SOCKET) * We received an insane amount of file descriptors
memcpy(recvfds, CMSG_DATA(cmsg), num_recvfds * sizeof(int)); * which exceeds the kernel limit we know about so
* close them and return an error.
*/
if (num_raw > KERNEL_SCM_MAX_FD) {
for (idx = 0; idx < num_raw; idx++)
close(fds_raw[idx]);
return syserrno_set(-EFBIG, "Received excessive number of file descriptors");
}
if (ret_fds->fd_count_max > num_raw) {
/*
* Make sure any excess entries in the fd array
* are set to -EBADF so our cleanup functions
* can safely be called.
*/
for (idx = num_raw; idx < ret_fds->fd_count_max; idx++)
ret_fds->fd[idx] = -EBADF;
WARN("Received fewer file descriptors than we expected %u != %u", ret_fds->fd_count_max, num_raw);
} else if (ret_fds->fd_count_max < num_raw) {
/* Make sure we close any excess fds we received. */
for (idx = ret_fds->fd_count_max; idx < num_raw; idx++)
close(fds_raw[idx]);
WARN("Received more file descriptors than we expected %u != %u", ret_fds->fd_count_max, num_raw);
/* Cap the number of received file descriptors. */
num_raw = ret_fds->fd_count_max;
}
memcpy(ret_fds->fd, CMSG_DATA(cmsg), num_raw * sizeof(int));
ret_fds->fd_count_ret = num_raw;
break; break;
} }
}
return ret; return ret;
} }
int lxc_abstract_unix_recv_fds(int fd, int *recvfds, int num_recvfds, ssize_t lxc_abstract_unix_recv_fds(int fd, struct unix_fds *ret_fds,
void *data, size_t size) void *ret_data, size_t size_ret_data)
{ {
char buf[1] = {0}; char buf[1] = {};
struct iovec iov = { struct iovec iov = {
.iov_base = data ? data : buf, .iov_base = ret_data ? ret_data : buf,
.iov_len = data ? size : sizeof(buf), .iov_len = ret_data ? size_ret_data : sizeof(buf),
}; };
return lxc_abstract_unix_recv_fds_iov(fd, recvfds, num_recvfds, &iov, 1); ssize_t ret;
ret = lxc_abstract_unix_recv_fds_iov(fd, ret_fds, &iov, 1);
if (ret < 0)
return ret;
return ret;
}
ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd, void *ret_data,
size_t size_ret_data)
{
call_cleaner(put_unix_fds) struct unix_fds *fds = NULL;
char buf[1] = {};
struct iovec iov = {
.iov_base = ret_data ? ret_data : buf,
.iov_len = ret_data ? size_ret_data : sizeof(buf),
};
ssize_t ret;
fds = &(struct unix_fds){
.fd_count_max = 1,
};
ret = lxc_abstract_unix_recv_fds_iov(fd, fds, &iov, 1);
if (ret < 0)
return ret;
if (ret == 0)
return ret_errno(ENODATA);
if (fds->fd_count_ret != fds->fd_count_max)
*ret_fd = -EBADF;
else
*ret_fd = move_fd(fds->fd[0]);
return ret;
}
ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd)
{
call_cleaner(put_unix_fds) struct unix_fds *fds = NULL;
char buf[1] = {};
struct iovec iov = {
.iov_base = buf,
.iov_len = sizeof(buf),
};
ssize_t ret;
fds = &(struct unix_fds){
.fd_count_max = 2,
};
ret = lxc_abstract_unix_recv_fds_iov(fd, fds, &iov, 1);
if (ret < 0)
return ret;
if (ret == 0)
return ret_errno(ENODATA);
if (fds->fd_count_ret != fds->fd_count_max) {
ret_fd[0] = -EBADF;
ret_fd[1] = -EBADF;
} else {
ret_fd[0] = move_fd(fds->fd[0]);
ret_fd[1] = move_fd(fds->fd[1]);
}
return 0;
} }
int lxc_abstract_unix_send_credential(int fd, void *data, size_t size) int lxc_abstract_unix_send_credential(int fd, void *data, size_t size)
......
...@@ -5,9 +5,24 @@ ...@@ -5,9 +5,24 @@
#include <stdio.h> #include <stdio.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <stddef.h>
#include <sys/un.h> #include <sys/un.h>
#include "compiler.h" #include "compiler.h"
#include "macro.h"
#include "memory_utils.h"
/*
* Technically 253 is the kernel limit but we want the struct to be a
* multiple of 8.
*/
#define KERNEL_SCM_MAX_FD 252
struct unix_fds {
__u32 fd_count_max;
__u32 fd_count_ret;
__s32 fd[KERNEL_SCM_MAX_FD];
} __attribute__((aligned(8)));
/* does not enforce \0-termination */ /* does not enforce \0-termination */
__hidden extern int lxc_abstract_unix_open(const char *path, int type, int flags); __hidden extern int lxc_abstract_unix_open(const char *path, int type, int flags);
...@@ -15,14 +30,29 @@ __hidden extern void lxc_abstract_unix_close(int fd); ...@@ -15,14 +30,29 @@ __hidden extern void lxc_abstract_unix_close(int fd);
/* does not enforce \0-termination */ /* does not enforce \0-termination */
__hidden extern int lxc_abstract_unix_connect(const char *path); __hidden extern int lxc_abstract_unix_connect(const char *path);
__hidden extern int lxc_abstract_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, __hidden extern int lxc_abstract_unix_send_fds(int fd, const int *sendfds,
size_t size) __access_r(2, 3) __access_r(4, 5); int num_sendfds, void *data,
size_t size) __access_r(2, 3)
__access_r(4, 5);
__hidden extern int lxc_abstract_unix_send_fds_iov(int fd, const int *sendfds,
int num_sendfds,
struct iovec *iov,
size_t iovlen)
__access_r(2, 3);
__hidden extern ssize_t lxc_abstract_unix_recv_fds(int fd,
struct unix_fds *ret_fds,
void *ret_data,
size_t size_ret_data)
__access_r(3, 4);
__hidden extern int lxc_abstract_unix_send_fds_iov(int fd, int *sendfds, int num_sendfds, __hidden extern ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd,
struct iovec *iov, size_t iovlen) __access_r(2, 3); void *ret_data,
size_t size_ret_data)
__access_r(3, 4);
__hidden extern int lxc_abstract_unix_recv_fds(int fd, int *recvfds, int num_recvfds, void *data, __hidden extern ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd);
size_t size) __access_r(2, 3) __access_r(4, 5);
__hidden extern int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, size_t size); __hidden extern int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, size_t size);
...@@ -37,4 +67,13 @@ __hidden extern int lxc_unix_connect(struct sockaddr_un *addr); ...@@ -37,4 +67,13 @@ __hidden extern int lxc_unix_connect(struct sockaddr_un *addr);
__hidden extern int lxc_unix_connect_type(struct sockaddr_un *addr, int type); __hidden extern int lxc_unix_connect_type(struct sockaddr_un *addr, int type);
__hidden extern int lxc_socket_set_timeout(int fd, int rcv_timeout, int snd_timeout); __hidden extern int lxc_socket_set_timeout(int fd, int rcv_timeout, int snd_timeout);
static inline void put_unix_fds(struct unix_fds *fds)
{
if (!IS_ERR_OR_NULL(fds)) {
for (size_t idx = 0; idx < fds->fd_count_ret; idx++)
close_prot_errno_disarm(fds->fd[idx]);
}
}
define_cleanup_function(struct unix_fds *, put_unix_fds);
#endif /* __LXC_AF_UNIX_H */ #endif /* __LXC_AF_UNIX_H */
...@@ -164,7 +164,7 @@ static inline bool sync_wake_fd(int fd, int fd_send) ...@@ -164,7 +164,7 @@ static inline bool sync_wake_fd(int fd, int fd_send)
static inline bool sync_wait_fd(int fd, int *fd_recv) static inline bool sync_wait_fd(int fd, int *fd_recv)
{ {
return lxc_abstract_unix_recv_fds(fd, fd_recv, 1, NULL, 0) > 0; return lxc_abstract_unix_recv_one_fd(fd, fd_recv, NULL, 0) > 0;
} }
static bool attach_lsm(lxc_attach_options_t *options) static bool attach_lsm(lxc_attach_options_t *options)
...@@ -400,7 +400,6 @@ static int get_attach_context(struct attach_context *ctx, ...@@ -400,7 +400,6 @@ static int get_attach_context(struct attach_context *ctx,
ctx->init_pid = pidfd_get_pid(ctx->dfd_self_pid, ctx->init_pidfd); ctx->init_pid = pidfd_get_pid(ctx->dfd_self_pid, ctx->init_pidfd);
else else
ctx->init_pid = lxc_cmd_get_init_pid(container->name, container->config_path); ctx->init_pid = lxc_cmd_get_init_pid(container->name, container->config_path);
if (ctx->init_pid < 0) if (ctx->init_pid < 0)
return log_error(-1, "Failed to get init pid"); return log_error(-1, "Failed to get init pid");
...@@ -488,16 +487,16 @@ static int same_nsfd(int dfd_pid1, int dfd_pid2, const char *ns_path) ...@@ -488,16 +487,16 @@ static int same_nsfd(int dfd_pid1, int dfd_pid2, const char *ns_path)
ret = fstatat(dfd_pid1, ns_path, &ns_st1, 0); ret = fstatat(dfd_pid1, ns_path, &ns_st1, 0);
if (ret) if (ret)
return -1; return -errno;
ret = fstatat(dfd_pid2, ns_path, &ns_st2, 0); ret = fstatat(dfd_pid2, ns_path, &ns_st2, 0);
if (ret) if (ret)
return -1; return -errno;
/* processes are in the same namespace */ /* processes are in the same namespace */
if ((ns_st1.st_dev == ns_st2.st_dev) && if ((ns_st1.st_dev == ns_st2.st_dev) &&
(ns_st1.st_ino == ns_st2.st_ino)) (ns_st1.st_ino == ns_st2.st_ino))
return -EINVAL; return 1;
return 0; return 0;
} }
...@@ -511,19 +510,23 @@ static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path) ...@@ -511,19 +510,23 @@ static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path)
(PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS & (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)), 0); ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)), 0);
if (ns_fd2 < 0) { if (ns_fd2 < 0) {
/* The kernel does not support this namespace. This is not an error. */
if (errno == ENOENT) if (errno == ENOENT)
return -EINVAL; return -ENOENT;
return log_error_errno(-errno, errno, "Failed to open %d(%s)", return syserrno(-errno, "Failed to open %d(%s)", dfd_pid2, ns_path);
dfd_pid2, ns_path);
} }
ret = same_nsfd(dfd_pid1, dfd_pid2, ns_path); ret = same_nsfd(dfd_pid1, dfd_pid2, ns_path);
if (ret < 0) switch (ret) {
return ret; case -ENOENT:
__fallthrough;
case 1:
return ret_errno(ENOENT);
case 0:
/* processes are in different namespaces */ /* processes are in different namespaces */
return move_fd(ns_fd2); return move_fd(ns_fd2);
}
return ret;
} }
static int __prepare_namespaces_pidfd(struct attach_context *ctx) static int __prepare_namespaces_pidfd(struct attach_context *ctx)
...@@ -537,14 +540,19 @@ static int __prepare_namespaces_pidfd(struct attach_context *ctx) ...@@ -537,14 +540,19 @@ static int __prepare_namespaces_pidfd(struct attach_context *ctx)
ret = same_nsfd(ctx->dfd_self_pid, ret = same_nsfd(ctx->dfd_self_pid,
ctx->dfd_init_pid, ctx->dfd_init_pid,
ns_info[i].proc_path); ns_info[i].proc_path);
if (ret == -EINVAL) switch (ret) {
case -ENOENT:
__fallthrough;
case 1:
ctx->ns_inherited &= ~ns_info[i].clone_flag; ctx->ns_inherited &= ~ns_info[i].clone_flag;
else if (ret < 0) break;
return log_error_errno(-1, errno, case 0:
"Failed to determine whether %s namespace is shared",
ns_info[i].proc_name);
else
TRACE("Shared %s namespace needs attach", ns_info[i].proc_name); TRACE("Shared %s namespace needs attach", ns_info[i].proc_name);
break;
}
return syserrno(-errno, "Failed to determine whether %s namespace is shared",
ns_info[i].proc_name);
} }
return 0; return 0;
...@@ -573,7 +581,7 @@ static int __prepare_namespaces_nsfd(struct attach_context *ctx, ...@@ -573,7 +581,7 @@ static int __prepare_namespaces_nsfd(struct attach_context *ctx,
if (ctx->ns_fd[i] >= 0) if (ctx->ns_fd[i] >= 0)
continue; continue;
if (ctx->ns_fd[i] == -EINVAL) { if (ctx->ns_fd[i] == -ENOENT) {
ctx->ns_inherited &= ~ns_info[i].clone_flag; ctx->ns_inherited &= ~ns_info[i].clone_flag;
continue; continue;
} }
......
...@@ -1305,6 +1305,9 @@ static int chown_cgroup_wrapper(void *data) ...@@ -1305,6 +1305,9 @@ static int chown_cgroup_wrapper(void *data)
for (int i = 0; arg->hierarchies[i]; i++) { for (int i = 0; arg->hierarchies[i]; i++) {
int dirfd = arg->hierarchies[i]->dfd_con; int dirfd = arg->hierarchies[i]->dfd_con;
if (dirfd < 0)
return syserrno_set(-EBADF, "Invalid cgroup file descriptor");
(void)fchowmodat(dirfd, "", destuid, nsgid, 0775); (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
/* /*
...@@ -1361,7 +1364,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops, ...@@ -1361,7 +1364,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
return true; return true;
} }
__cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops) __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
{ {
if (!ops) if (!ops)
return; return;
...@@ -1371,15 +1374,12 @@ __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops) ...@@ -1371,15 +1374,12 @@ __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
for (int i = 0; ops->hierarchies[i]; i++) { for (int i = 0; ops->hierarchies[i]; i++) {
struct hierarchy *h = ops->hierarchies[i]; struct hierarchy *h = ops->hierarchies[i];
/*
* we don't keep the fds for non-unified hierarchies around /* Close all monitor cgroup file descriptors. */
* mainly because we don't make use of them anymore after the close_prot_errno_disarm(h->dfd_mon);
* core cgroup setup is done but also because there are quite a
* lot of them.
*/
if (!is_unified_hierarchy(h))
close_prot_errno_disarm(h->dfd_con);
} }
/* Close the cgroup root file descriptor. */
close_prot_errno_disarm(ops->dfd_mnt);
/* /*
* The checking for freezer support should obviously be done at cgroup * The checking for freezer support should obviously be done at cgroup
...@@ -2183,8 +2183,8 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, ...@@ -2183,8 +2183,8 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
size_t pidstr_len; size_t pidstr_len;
ssize_t ret; ssize_t ret;
ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0); ret = lxc_abstract_unix_recv_two_fds(sk, target_fds);
if (ret <= 0) if (ret < 0)
return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
target_fd0 = target_fds[0]; target_fd0 = target_fds[0];
target_fd1 = target_fds[1]; target_fd1 = target_fds[1];
...@@ -3322,7 +3322,7 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) ...@@ -3322,7 +3322,7 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers; cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
cgfsng_ops->payload_create = cgfsng_payload_create; cgfsng_ops->payload_create = cgfsng_payload_create;
cgfsng_ops->payload_enter = cgfsng_payload_enter; cgfsng_ops->payload_enter = cgfsng_payload_enter;
cgfsng_ops->payload_finalize = cgfsng_payload_finalize; cgfsng_ops->finalize = cgfsng_finalize;
cgfsng_ops->get_cgroup = cgfsng_get_cgroup; cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
cgfsng_ops->get = cgfsng_get; cgfsng_ops->get = cgfsng_get;
cgfsng_ops->set = cgfsng_set; cgfsng_ops->set = cgfsng_set;
...@@ -3345,23 +3345,14 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) ...@@ -3345,23 +3345,14 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
return move_ptr(cgfsng_ops); return move_ptr(cgfsng_ops);
} }
int cgroup_attach(const struct lxc_conf *conf, const char *name, static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
const char *lxcpath, pid_t pid)
{ {
__do_close int unified_fd = -EBADF;
int ret; int ret;
if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
return ret_errno(EINVAL);
unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
if (unified_fd < 0)
return ret_errno(ENOCGROUP2);
if (!lxc_list_empty(&conf->id_map)) { if (!lxc_list_empty(&conf->id_map)) {
struct userns_exec_unified_attach_data args = { struct userns_exec_unified_attach_data args = {
.conf = conf, .conf = conf,
.unified_fd = unified_fd, .unified_fd = fd_unified,
.pid = pid, .pid = pid,
}; };
...@@ -3375,7 +3366,76 @@ int cgroup_attach(const struct lxc_conf *conf, const char *name, ...@@ -3375,7 +3366,76 @@ int cgroup_attach(const struct lxc_conf *conf, const char *name,
cgroup_unified_attach_child_wrapper, cgroup_unified_attach_child_wrapper,
&args); &args);
} else { } else {
ret = cgroup_attach_leaf(conf, unified_fd, pid); ret = cgroup_attach_leaf(conf, fd_unified, pid);
}
return ret;
}
static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
const char *lxcpath, pid_t pid)
{
call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
int ret;
char pidstr[INTTYPE_TO_STRLEN(pid_t)];
size_t idx;
ssize_t pidstr_len;
ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, NULL, true,
sizeof(struct cgroup_ctx), ctx);
if (ret < 0)
return ret_errno(ENOSYS);
pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
if (pidstr_len < 0)
return pidstr_len;
for (idx = 0; idx < ctx->fd_len; idx++) {
int dfd_con = ctx->fd[idx];
if (unified_cgroup_fd(dfd_con))
ret = __unified_attach_fd(conf, dfd_con, pid);
else
ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
if (ret)
return syserrno(ret, "Failed to attach to cgroup fd %d", dfd_con);
else
TRACE("Attached to cgroup fd %d", dfd_con);
}
if (idx == 0)
return syserrno_set(-ENOENT, "Failed to attach to cgroups");
TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->cgroup_layout));
return 0;
}
static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
const char *lxcpath, pid_t pid)
{
__do_close int dfd_unified = -EBADF;
if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
return ret_errno(EINVAL);
dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
if (dfd_unified < 0)
return ret_errno(ENOCGROUP2);
return __unified_attach_fd(conf, dfd_unified, pid);
}
int cgroup_attach(const struct lxc_conf *conf, const char *name,
const char *lxcpath, pid_t pid)
{
int ret;
ret = __cgroup_attach_many(conf, name, lxcpath, pid);
if (ret < 0) {
if (ret != ENOSYS)
return ret;
ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
} }
return ret; return ret;
......
...@@ -5,9 +5,11 @@ ...@@ -5,9 +5,11 @@
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h> #include <stddef.h>
#include <linux/types.h>
#include <sys/types.h> #include <sys/types.h>
#include <linux/magic.h> #include <linux/magic.h>
#include "af_unix.h"
#include "compiler.h" #include "compiler.h"
#include "macro.h" #include "macro.h"
#include "memory_utils.h" #include "memory_utils.h"
...@@ -33,6 +35,22 @@ typedef enum { ...@@ -33,6 +35,22 @@ typedef enum {
CGROUP_LAYOUT_UNIFIED = 2, CGROUP_LAYOUT_UNIFIED = 2,
} cgroup_layout_t; } cgroup_layout_t;
static inline const char *cgroup_layout_name(cgroup_layout_t layout)
{
switch (layout) {
case CGROUP_LAYOUT_LEGACY:
return "legacy";
case CGROUP_LAYOUT_HYBRID:
return "hybrid";
case CGROUP_LAYOUT_UNIFIED:
return "unified";
case CGROUP_LAYOUT_UNKNOWN:
break;
}
return "unknown";
}
typedef enum { typedef enum {
LEGACY_HIERARCHY = CGROUP_SUPER_MAGIC, LEGACY_HIERARCHY = CGROUP_SUPER_MAGIC,
UNIFIED_HIERARCHY = CGROUP2_SUPER_MAGIC, UNIFIED_HIERARCHY = CGROUP2_SUPER_MAGIC,
...@@ -41,6 +59,17 @@ typedef enum { ...@@ -41,6 +59,17 @@ typedef enum {
#define DEVICES_CONTROLLER (1U << 0) #define DEVICES_CONTROLLER (1U << 0)
#define FREEZER_CONTROLLER (1U << 1) #define FREEZER_CONTROLLER (1U << 1)
/* That's plenty of hierarchies. */
#define CGROUP_CTX_MAX_FD 20
// BUILD_BUG_ON(CGROUP_CTX_MAX_FD > KERNEL_SCM_MAX_FD);
struct cgroup_ctx {
__s32 cgroup_layout;
__u32 utilities;
__u32 fd_len;
__s32 fd[CGROUP_CTX_MAX_FD];
} __attribute__((aligned(8)));
/* A descriptor for a mounted hierarchy /* A descriptor for a mounted hierarchy
* *
* @controllers * @controllers
...@@ -218,7 +247,7 @@ struct cgroup_ops { ...@@ -218,7 +247,7 @@ struct cgroup_ops {
struct lxc_handler *handler); struct lxc_handler *handler);
bool (*monitor_delegate_controllers)(struct cgroup_ops *ops); bool (*monitor_delegate_controllers)(struct cgroup_ops *ops);
bool (*payload_delegate_controllers)(struct cgroup_ops *ops); bool (*payload_delegate_controllers)(struct cgroup_ops *ops);
void (*payload_finalize)(struct cgroup_ops *ops); void (*finalize)(struct cgroup_ops *ops);
const char *(*get_limiting_cgroup)(struct cgroup_ops *ops, const char *controller); const char *(*get_limiting_cgroup)(struct cgroup_ops *ops, const char *controller);
}; };
...@@ -257,4 +286,36 @@ static inline int cgroup_unified_fd(const struct cgroup_ops *ops) ...@@ -257,4 +286,36 @@ static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
__first, __VA_ARGS__); \ __first, __VA_ARGS__); \
}) })
static void put_cgroup_ctx(struct cgroup_ctx *ctx)
{
if (!IS_ERR_OR_NULL(ctx)) {
for (__u32 idx = 0; idx < ctx->fd_len; idx++)
close_prot_errno_disarm(ctx->fd[idx]);
}
}
define_cleanup_function(struct cgroup_ctx *, put_cgroup_ctx);
static inline int prepare_cgroup_ctx(struct cgroup_ops *ops,
struct cgroup_ctx *ctx)
{
__u32 idx;
for (idx = 0; ops->hierarchies[idx]; idx++) {
if (idx >= CGROUP_CTX_MAX_FD)
return ret_errno(E2BIG);
ctx->fd[idx] = ops->hierarchies[idx]->dfd_con;
}
if (idx == 0)
return ret_errno(ENOENT);
ctx->fd_len = idx;
ctx->cgroup_layout = ops->cgroup_layout;
if (ops->unified && ops->unified->dfd_con > 0)
ctx->utilities = ops->unified->utilities;
return 0;
}
#endif /* __LXC_CGROUP_H */ #endif /* __LXC_CGROUP_H */
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <unistd.h> #include <unistd.h>
#include "compiler.h" #include "compiler.h"
#include "cgroups/cgroup.h"
#include "lxccontainer.h" #include "lxccontainer.h"
#include "macro.h" #include "macro.h"
#include "state.h" #include "state.h"
...@@ -20,29 +21,30 @@ ...@@ -20,29 +21,30 @@
#define LXC_CMD_REAP_CLIENT_FD 1 #define LXC_CMD_REAP_CLIENT_FD 1
typedef enum { typedef enum {
LXC_CMD_CONSOLE, LXC_CMD_CONSOLE = 0,
LXC_CMD_TERMINAL_WINCH, LXC_CMD_TERMINAL_WINCH = 1,
LXC_CMD_STOP, LXC_CMD_STOP = 2,
LXC_CMD_GET_STATE, LXC_CMD_GET_STATE = 3,
LXC_CMD_GET_INIT_PID, LXC_CMD_GET_INIT_PID = 4,
LXC_CMD_GET_CLONE_FLAGS, LXC_CMD_GET_CLONE_FLAGS = 5,
LXC_CMD_GET_CGROUP, LXC_CMD_GET_CGROUP = 6,
LXC_CMD_GET_CONFIG_ITEM, LXC_CMD_GET_CONFIG_ITEM = 7,
LXC_CMD_GET_NAME, LXC_CMD_GET_NAME = 8,
LXC_CMD_GET_LXCPATH, LXC_CMD_GET_LXCPATH = 9,
LXC_CMD_ADD_STATE_CLIENT, LXC_CMD_ADD_STATE_CLIENT = 10,
LXC_CMD_CONSOLE_LOG, LXC_CMD_CONSOLE_LOG = 11,
LXC_CMD_SERVE_STATE_CLIENTS, LXC_CMD_SERVE_STATE_CLIENTS = 12,
LXC_CMD_SECCOMP_NOTIFY_ADD_LISTENER, LXC_CMD_SECCOMP_NOTIFY_ADD_LISTENER = 13,
LXC_CMD_ADD_BPF_DEVICE_CGROUP, LXC_CMD_ADD_BPF_DEVICE_CGROUP = 14,
LXC_CMD_FREEZE, LXC_CMD_FREEZE = 15,
LXC_CMD_UNFREEZE, LXC_CMD_UNFREEZE = 16,
LXC_CMD_GET_CGROUP2_FD, LXC_CMD_GET_CGROUP2_FD = 17,
LXC_CMD_GET_INIT_PIDFD, LXC_CMD_GET_INIT_PIDFD = 18,
LXC_CMD_GET_LIMITING_CGROUP, LXC_CMD_GET_LIMITING_CGROUP = 19,
LXC_CMD_GET_LIMITING_CGROUP2_FD, LXC_CMD_GET_LIMITING_CGROUP2_FD = 20,
LXC_CMD_GET_DEVPTS_FD, LXC_CMD_GET_DEVPTS_FD = 21,
LXC_CMD_GET_SECCOMP_NOTIFY_FD, LXC_CMD_GET_SECCOMP_NOTIFY_FD = 22,
LXC_CMD_GET_CGROUP_CTX = 23,
LXC_CMD_MAX, LXC_CMD_MAX,
} lxc_cmd_t; } lxc_cmd_t;
...@@ -122,6 +124,11 @@ __hidden extern int lxc_try_cmd(const char *name, const char *lxcpath); ...@@ -122,6 +124,11 @@ __hidden extern int lxc_try_cmd(const char *name, const char *lxcpath);
__hidden extern int lxc_cmd_console_log(const char *name, const char *lxcpath, __hidden extern int lxc_cmd_console_log(const char *name, const char *lxcpath,
struct lxc_console_log *log); struct lxc_console_log *log);
__hidden extern int lxc_cmd_get_seccomp_notify_fd(const char *name, const char *lxcpath); __hidden extern int lxc_cmd_get_seccomp_notify_fd(const char *name, const char *lxcpath);
__hidden extern int lxc_cmd_get_cgroup_ctx(const char *name, const char *lxcpath,
const char *controller, bool batch,
size_t size_ret_ctx,
struct cgroup_ctx *ret_ctx)
__access_r(6, 5);
__hidden extern int lxc_cmd_seccomp_notify_add_listener(const char *name, const char *lxcpath, int fd, __hidden extern int lxc_cmd_seccomp_notify_add_listener(const char *name, const char *lxcpath, int fd,
/* unused */ unsigned int command, /* unused */ unsigned int command,
/* unused */ unsigned int flags); /* unused */ unsigned int flags);
......
...@@ -1509,8 +1509,10 @@ int lxc_setup_devpts_parent(struct lxc_handler *handler) ...@@ -1509,8 +1509,10 @@ int lxc_setup_devpts_parent(struct lxc_handler *handler)
if (handler->conf->pty_max <= 0) if (handler->conf->pty_max <= 0)
return 0; return 0;
ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1, ret = lxc_abstract_unix_recv_one_fd(handler->data_sock[1],
&handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd)); &handler->conf->devpts_fd,
&handler->conf->devpts_fd,
sizeof(handler->conf->devpts_fd));
if (ret < 0) if (ret < 0)
return log_error_errno(-1, errno, "Failed to receive devpts fd from child"); return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
......
...@@ -530,6 +530,14 @@ __lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \ ...@@ -530,6 +530,14 @@ __lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \
__internal_ret__; \ __internal_ret__; \
}) })
#define syswarn_set(__ret__, format, ...) \
({ \
typeof(__ret__) __internal_ret__ = (__ret__); \
errno = abs(__ret__); \
SYSWARN(format, ##__VA_ARGS__); \
__internal_ret__; \
})
#define log_error(__ret__, format, ...) \ #define log_error(__ret__, format, ...) \
({ \ ({ \
typeof(__ret__) __internal_ret__ = (__ret__); \ typeof(__ret__) __internal_ret__ = (__ret__); \
......
...@@ -737,4 +737,12 @@ static inline int PTR_RET(const void *ptr) ...@@ -737,4 +737,12 @@ static inline int PTR_RET(const void *ptr)
return 0; return 0;
} }
#define min(x, y) \
({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void)(&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; \
})
#endif /* __LXC_MACRO_H */ #endif /* __LXC_MACRO_H */
...@@ -101,7 +101,7 @@ static inline void *memdup(const void *data, size_t len) ...@@ -101,7 +101,7 @@ static inline void *memdup(const void *data, size_t len)
({ \ ({ \
if (a >= 0 && a != b) \ if (a >= 0 && a != b) \
close(a); \ close(a); \
if (close >= 0) \ if (b >= 0) \
close(b); \ close(b); \
a = b = -EBADF; \ a = b = -EBADF; \
}) })
......
...@@ -1637,9 +1637,9 @@ int lxc_seccomp_recv_notifier_fd(struct lxc_seccomp *seccomp, int socket_fd) ...@@ -1637,9 +1637,9 @@ int lxc_seccomp_recv_notifier_fd(struct lxc_seccomp *seccomp, int socket_fd)
if (seccomp->notifier.wants_supervision) { if (seccomp->notifier.wants_supervision) {
int ret; int ret;
ret = lxc_abstract_unix_recv_fds(socket_fd, ret = lxc_abstract_unix_recv_one_fd(socket_fd,
&seccomp->notifier.notify_fd, &seccomp->notifier.notify_fd,
1, NULL, 0); NULL, 0);
if (ret < 0) if (ret < 0)
return -1; return -1;
} }
......
...@@ -1041,7 +1041,7 @@ static int do_start(void *data) ...@@ -1041,7 +1041,7 @@ static int do_start(void *data)
lxc_sync_fini_parent(handler); lxc_sync_fini_parent(handler);
if (lxc_abstract_unix_recv_fds(data_sock1, &status_fd, 1, NULL, 0) < 0) { if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
ERROR("Failed to receive status file descriptor to child process"); ERROR("Failed to receive status file descriptor to child process");
goto out_warn_father; goto out_warn_father;
} }
...@@ -1460,7 +1460,7 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler) ...@@ -1460,7 +1460,7 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
for (i = 0; i < conf->ttys.max; i++) { for (i = 0; i < conf->ttys.max; i++) {
int ttyfds[2]; int ttyfds[2];
ret = lxc_abstract_unix_recv_fds(sock, ttyfds, 2, NULL, 0); ret = lxc_abstract_unix_recv_two_fds(sock, ttyfds);
if (ret < 0) if (ret < 0)
break; break;
...@@ -1888,7 +1888,7 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1888,7 +1888,7 @@ static int lxc_spawn(struct lxc_handler *handler)
} }
} }
cgroup_ops->payload_finalize(cgroup_ops); cgroup_ops->finalize(cgroup_ops);
TRACE("Finished setting up cgroups"); TRACE("Finished setting up cgroups");
if (handler->ns_unshare_flags & CLONE_NEWTIME) { if (handler->ns_unshare_flags & CLONE_NEWTIME) {
......
...@@ -36,7 +36,7 @@ static const char *const strstate[] = { ...@@ -36,7 +36,7 @@ static const char *const strstate[] = {
const char *lxc_state2str(lxc_state_t state) const char *lxc_state2str(lxc_state_t state)
{ {
if (state < STOPPED || state > MAX_STATE - 1) if (state < STOPPED || state > MAX_STATE - 1)
return NULL; return "INVALID STATE";
return strstate[state]; return strstate[state];
} }
......
...@@ -245,4 +245,30 @@ __hidden extern int safe_mount_beneath_at(int beneat_fd, const char *src, const ...@@ -245,4 +245,30 @@ __hidden extern int safe_mount_beneath_at(int beneat_fd, const char *src, const
const char *fstype, unsigned int flags, const void *data); const char *fstype, unsigned int flags, const void *data);
__hidden __lxc_unused int print_r(int fd, const char *path); __hidden __lxc_unused int print_r(int fd, const char *path);
static inline int copy_struct_from_client(__u32 server_size, void *dst,
__u32 client_size, const void *src)
{
__u32 size = min(server_size, client_size);
__u32 rest = min(server_size, client_size) - size;
/* Deal with trailing bytes. */
if (client_size < server_size) {
memset(dst + size, 0, rest);
} else if (client_size > server_size) {
/* TODO: Actually come up with a nice way to test for 0. */
return 0;
}
memcpy(dst, src, size);
return 0;
}
static inline __u32 copy_struct_to_client(__u32 client_size, void *dst,
__u32 server_size, const void *src)
{
__u32 size = min(server_size, client_size);
memcpy(dst, src, size);
return size;
}
#endif /* __LXC_UTILS_H */ #endif /* __LXC_UTILS_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment