Unverified Commit 567104e5 by Stéphane Graber Committed by GitHub

Merge pull request #3513 from brauner/2020-08-09/openat2

openat2() and safe mounting
parents da0fdceb d43d5191
......@@ -622,7 +622,8 @@ AC_CHECK_HEADER([ifaddrs.h],
AC_HEADER_MAJOR
# Check for some syscalls functions
AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3 fsopen fspick fsconfig fsmount])
AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3 fsopen fspick fsconfig fsmount openat2])
AC_CHECK_TYPES([struct open_how], [], [], [[#include <linux/openat2.h>]])
AC_CHECK_TYPES([struct clone_args], [], [], [[#include <linux/sched.h>]])
AC_CHECK_MEMBERS([struct clone_args.set_tid],[],[],[[#include <linux/sched.h>]])
AC_CHECK_MEMBERS([struct clone_args.cgroup],[],[],[[#include <linux/sched.h>]])
......
......@@ -1872,9 +1872,17 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
}
/* mount tmpfs */
ret = safe_mount(NULL, cgroup_root, "tmpfs",
MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
"size=10240k,mode=755", root);
ret = safe_mount_beneath(root, NULL, DEFAULT_CGROUP_MOUNTPOINT, "tmpfs",
MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
"size=10240k,mode=755");
if (ret < 0) {
if (errno != ENOSYS)
return false;
ret = safe_mount(NULL, cgroup_root, "tmpfs",
MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
"size=10240k,mode=755", root);
}
if (ret < 0)
return false;
......
......@@ -1051,50 +1051,50 @@ on_error:
static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
int autodevtmpfssize, const char *lxcpath)
{
__do_free char *path = NULL;
const char *path = rootfs->path ? rootfs->mount : NULL;
int ret;
size_t clen;
mode_t cur_mask;
char mount_options[128];
INFO("Preparing \"/dev\"");
/* $(rootfs->mount) + "/dev/pts" + '\0' */
clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
path = must_realloc(NULL, clen);
sprintf(mount_options, "size=%d,mode=755", (autodevtmpfssize != 0) ? autodevtmpfssize : 500000);
DEBUG("Using mount options: %s", mount_options);
ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
if (ret < 0 || (size_t)ret >= clen)
return -1;
cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
ret = mkdirat(rootfs->mntpt_fd, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
if (ret < 0 && errno != EEXIST) {
SYSERROR("Failed to create \"/dev\" directory");
ret = -errno;
goto reset_umask;
}
ret = safe_mount("none", path, "tmpfs", 0, mount_options,
rootfs->path ? rootfs->mount : NULL );
ret = safe_mount_beneath_at(rootfs->mntpt_fd, "none", "dev", "tmpfs", 0, mount_options);
if (ret < 0) {
SYSERROR("Failed to mount tmpfs on \"%s\"", path);
goto reset_umask;
}
TRACE("Mounted tmpfs on \"%s\"", path);
__do_free char *fallback_path = NULL;
ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
if (ret < 0 || (size_t)ret >= clen) {
ret = -1;
goto reset_umask;
if (errno != ENOSYS) {
SYSERROR("Failed to mount tmpfs on \"%s\"", path);
goto reset_umask;
}
if (path) {
fallback_path = must_make_path(path, "/dev", NULL);
ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
} else {
ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
}
if (ret < 0) {
SYSERROR("Failed to mount tmpfs on \"%s\"", path);
goto reset_umask;
}
}
TRACE("Mounted tmpfs on \"%s\"", path);
/* If we are running on a devtmpfs mapping, dev/pts may already exist.
* If not, then create it and exit if that fails...
*/
ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
ret = mkdirat(rootfs->mntpt_fd, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
if (ret < 0 && errno != EEXIST) {
SYSERROR("Failed to create directory \"%s\"", path);
ret = -errno;
......@@ -1136,39 +1136,33 @@ enum {
static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
{
__do_close int dev_dir_fd = -EBADF;
int i, ret;
char path[PATH_MAX];
mode_t cmask;
int use_mknod = LXC_DEVNODE_MKNOD;
ret = snprintf(path, PATH_MAX, "%s/dev",
rootfs->path ? rootfs->mount : "");
if (ret < 0 || ret >= PATH_MAX)
return -1;
/* ignore, just don't try to fill in */
if (!dir_exists(path))
if (!exists_dir_at(rootfs->mntpt_fd, "dev"))
return 0;
dev_dir_fd = openat(rootfs->mntpt_fd, "dev/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOFOLLOW);
if (dev_dir_fd < 0)
return -errno;
INFO("Populating \"/dev\"");
cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
char hostpath[PATH_MAX];
char hostpath[PATH_MAX], path[PATH_MAX];
const struct lxc_device_node *device = &lxc_devices[i];
ret = snprintf(path, PATH_MAX, "%s/dev/%s",
rootfs->path ? rootfs->mount : "", device->name);
if (ret < 0 || ret >= PATH_MAX)
return -1;
if (use_mknod >= LXC_DEVNODE_MKNOD) {
ret = mknod(path, device->mode, makedev(device->maj, device->min));
ret = mknodat(dev_dir_fd, device->name, device->mode, makedev(device->maj, device->min));
if (ret == 0 || (ret < 0 && errno == EEXIST)) {
DEBUG("Created device node \"%s\"", path);
DEBUG("Created device node \"%s\"", device->name);
} else if (ret < 0) {
if (errno != EPERM)
return log_error_errno(-1, errno, "Failed to create device node \"%s\"", path);
return log_error_errno(-1, errno, "Failed to create device node \"%s\"", device->name);
use_mknod = LXC_DEVNODE_BIND;
}
......@@ -1178,19 +1172,19 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
continue;
if (use_mknod == LXC_DEVNODE_MKNOD) {
__do_close int fd = -EBADF;
/* See
* - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
* - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
*/
ret = open(path, O_RDONLY | O_CLOEXEC);
if (ret >= 0) {
close_prot_errno_disarm(ret);
fd = openat(dev_dir_fd, device->name, O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
/* Device nodes are fully useable. */
use_mknod = LXC_DEVNODE_OPEN;
continue;
}
SYSTRACE("Failed to open \"%s\" device", path);
SYSTRACE("Failed to open \"%s\" device", device->name);
/* Device nodes are only partially useable. */
use_mknod = LXC_DEVNODE_PARTIAL;
}
......@@ -1201,22 +1195,25 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
* nodes the prio mknod() call will have created the
* device node so we can use it as a bind-mount target.
*/
ret = mknod(path, S_IFREG | 0000, 0);
ret = mknodat(dev_dir_fd, device->name, S_IFREG | 0000, 0);
if (ret < 0 && errno != EEXIST)
return log_error_errno(-1, errno, "Failed to create file \"%s\"", path);
return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
}
/* Fallback to bind-mounting the device from the host. */
ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
if (ret < 0 || ret >= PATH_MAX)
return -1;
snprintf(hostpath, sizeof(hostpath), "/dev/%s", device->name);
ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
rootfs->path ? rootfs->mount : NULL);
ret = safe_mount_beneath_at(dev_dir_fd, hostpath, device->name, NULL, MS_BIND, NULL);
if (ret < 0) {
const char *mntpt = rootfs->path ? rootfs->mount : NULL;
if (errno == ENOSYS) {
snprintf(path, sizeof(path), "%s/dev/%s", mntpt, device->name);
ret = safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL);
}
}
if (ret < 0)
return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" onto \"%s\"",
hostpath, path);
DEBUG("Bind mounted host device node \"%s\" onto \"%s\"", hostpath, path);
return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" onto \"%s\"", hostpath, device->name);
DEBUG("Bind mounted host device node \"%s\" onto \"%s\"", hostpath, device->name);
}
(void)umask(cmask);
......@@ -1228,13 +1225,17 @@ static int lxc_mount_rootfs(struct lxc_conf *conf)
{
int ret;
struct lxc_storage *bdev;
const struct lxc_rootfs *rootfs = &conf->rootfs;
struct lxc_rootfs *rootfs = &conf->rootfs;
if (!rootfs->path) {
ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
if (ret < 0)
return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
rootfs->mntpt_fd = openat(-1, "/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
if (rootfs->mntpt_fd < 0)
return -errno;
return 0;
}
......@@ -1260,6 +1261,10 @@ static int lxc_mount_rootfs(struct lxc_conf *conf)
rootfs->path, rootfs->mount,
rootfs->options ? rootfs->options : "(null)");
rootfs->mntpt_fd = openat(-1, rootfs->mount, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
if (rootfs->mntpt_fd < 0)
return -errno;
return 0;
}
......@@ -2575,6 +2580,7 @@ struct lxc_conf *lxc_conf_init(void)
return NULL;
}
new->rootfs.managed = true;
new->rootfs.mntpt_fd = -EBADF;
new->logfd = -1;
lxc_list_init(&new->cgroup);
lxc_list_init(&new->cgroup2);
......@@ -3372,6 +3378,7 @@ int lxc_setup(struct lxc_handler *handler)
return log_error(-1, "Failed to drop capabilities");
}
close_prot_errno_disarm(lxc_conf->rootfs.mntpt_fd);
NOTICE("The container \"%s\" is set up", name);
return 0;
......@@ -3735,6 +3742,7 @@ void lxc_conf_free(struct lxc_conf *conf)
free(conf->rootfs.options);
free(conf->rootfs.path);
free(conf->rootfs.data);
close_prot_errno_disarm(conf->rootfs.mntpt_fd);
free(conf->logfile);
if (conf->logfd != -1)
close(conf->logfd);
......
......@@ -146,8 +146,10 @@ struct lxc_tty_info {
* @mountflags : the portion of @options that are flags
* @data : the portion of @options that are not flags
* @managed : whether it is managed by LXC
* @mntpt_fd : fd for @mount
*/
struct lxc_rootfs {
int mntpt_fd;
char *path;
char *mount;
char *bdev_type;
......
......@@ -539,3 +539,15 @@ int timens_offset_write(clockid_t clk_id, int64_t s_offset, int64_t ns_offset)
return 0;
}
/*
 * Report whether @path, resolved relative to @dir_fd, names a directory.
 *
 * Any fstatat() failure (missing path, bad descriptor, permission problem, ...)
 * is reported as "not a directory"; the caller cannot tell the cases apart.
 * Symlinks are followed because no AT_* flags are passed.
 */
bool exists_dir_at(int dir_fd, const char *path)
{
	struct stat st;

	return fstatat(dir_fd, path, &st, 0) < 0 ? false : S_ISDIR(st.st_mode);
}
......@@ -73,5 +73,6 @@ __hidden extern int lxc_open_dirfd(const char *dir);
__hidden extern FILE *fdopen_cached(int fd, const char *mode, void **caller_freed_buffer);
__hidden extern FILE *fopen_cached(const char *path, const char *mode, void **caller_freed_buffer);
__hidden extern int timens_offset_write(clockid_t clk_id, int64_t s_offset, int64_t ns_offset);
__hidden extern bool exists_dir_at(int dir_fd, const char *path);
#endif /* __LXC_FILE_UTILS_H */
......@@ -640,4 +640,24 @@
#endif
#endif
/*
 * Fallback syscall number for openat2(2), used only when the system headers
 * did not already provide __NR_openat2. The base number is 437; alpha, the
 * three MIPS ABIs and ia64 use their own values as listed below.
 */
#ifndef __NR_openat2
#if defined __alpha__
#define __NR_openat2 547
#elif defined _MIPS_SIM
/*
 * NOTE(review): if _MIPS_SIM matches none of the three ABIs below,
 * __NR_openat2 is left undefined - confirm that this is intended.
 */
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_openat2 4437
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_openat2 6437
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_openat2 5437
#endif
#elif defined __ia64__
#define __NR_openat2 (437 + 1024)
#else
/* Every architecture not special-cased above. */
#define __NR_openat2 437
#endif
#endif
#endif /* __LXC_SYSCALL_NUMBERS_H */
......@@ -16,6 +16,7 @@
#include <unistd.h>
#include "config.h"
#include "macro.h"
#include "syscall_numbers.h"
#ifdef HAVE_LINUX_MEMFD_H
......@@ -26,6 +27,10 @@
#include <sys/signalfd.h>
#endif
#ifdef HAVE_STRUCT_OPEN_HOW
#include <linux/openat2.h>
#endif
typedef int32_t key_serial_t;
#if !HAVE_KEYCTL
......@@ -203,4 +208,61 @@ static inline int fsmount_lxc(int fs_fd, unsigned int flags, unsigned int attr_f
extern int fsmount(int fs_fd, unsigned int flags, unsigned int attr_flags);
#endif
/*
 * Arguments for how openat2(2) should open the target path. If only @flags and
 * @mode are non-zero, then openat2(2) operates very similarly to openat(2).
 *
 * However, unlike openat(2), unknown or invalid bits in @flags result in
 * -EINVAL rather than being silently ignored. @mode must be zero unless one of
 * {O_CREAT, O_TMPFILE} is set.
 *
 * The fallback openat2() wrapper in this header hands a pointer to this struct
 * to the raw syscall in place of a struct open_how.
 *
 * @flags: O_* flags.
 * @mode: O_CREAT/O_TMPFILE file mode.
 * @resolve: RESOLVE_* flags.
 */
struct lxc_open_how {
/* O_* open flags. */
__u64 flags;
/* File mode; only meaningful together with O_CREAT/O_TMPFILE. */
__u64 mode;
/* Bitmask of RESOLVE_* path-resolution restrictions. */
__u64 resolve;
};
/* how->resolve flags for openat2(2). */
#ifndef RESOLVE_NO_XDEV
#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings
(includes bind-mounts). */
#endif
#ifndef RESOLVE_NO_MAGICLINKS
#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style
"magic-links". */
#endif
#ifndef RESOLVE_NO_SYMLINKS
#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks
(implies RESOLVE_NO_MAGICLINKS) */
#endif
#ifndef RESOLVE_BENEATH
#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like
"..", symlinks, and absolute
paths which escape the dirfd. */
#endif
#ifndef RESOLVE_IN_ROOT
#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
be scoped inside the dirfd
(similar to chroot(2)). */
#endif
#ifndef HAVE_OPENAT2
/*
 * Fallback openat2(2) wrapper for C libraries that do not provide one: invokes
 * the raw syscall with @how reinterpreted as the kernel's struct open_how.
 */
static inline int openat2(int dfd, const char *filename, struct lxc_open_how *how, size_t size)
{
	long rc;

#ifdef HAVE_STRUCT_OPEN_HOW
	/* When struct open_how is updated we should update lxc as well. */
	BUILD_BUG_ON(sizeof(struct lxc_open_how) != sizeof(struct open_how));
#endif

	rc = syscall(__NR_openat2, dfd, filename, (struct open_how *)how, size);
	return rc;
}
#endif /* HAVE_OPENAT2 */
#endif /* __LXC_SYSCALL_WRAPPER_H */
......@@ -569,15 +569,7 @@ gid_t get_ns_gid(gid_t orig)
/*
 * Report whether @path names an existing directory.
 *
 * Delegates to exists_dir_at() so both helpers share one implementation.
 * AT_FDCWD is passed (rather than an invalid descriptor) so that relative
 * paths keep resolving against the current working directory, exactly as the
 * former stat()-based check did. Any lookup failure is reported as false.
 */
bool dir_exists(const char *path)
{
	return exists_dir_at(AT_FDCWD, path);
}
/* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
......@@ -1079,6 +1071,61 @@ out:
return dirfd;
}
/*
 * Mount @src onto @dst, where @dst is resolved strictly beneath @beneath_fd.
 *
 * Path resolution uses openat2() with RESOLVE_NO_XDEV, RESOLVE_NO_SYMLINKS,
 * RESOLVE_NO_MAGICLINKS and RESOLVE_BENEATH. The resulting descriptors are
 * then handed to mount() through their /proc/self/fd/<n> names.
 *
 * @beneath_fd: directory fd that @dst (and a relative bind-mount @src) must
 *              stay beneath; must be >= 0.
 * @src:        mount source. Only when MS_BIND is set and @src is a relative
 *              path is it also opened beneath @beneath_fd; otherwise it is
 *              passed to mount() unchanged (it may be NULL).
 * @dst:        mount target, relative to @beneath_fd.
 * @fstype, @flags, @data: passed through to mount().
 *
 * Returns the result of mount() on the final step, or a negative errno value
 * if an earlier step fails. errno is set on every failure path, because
 * callers inspect errno (e.g. for ENOSYS) to decide whether to fall back to
 * safe_mount().
 */
int __safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
			    unsigned int flags, const void *data)
{
	__do_close int source_fd = -EBADF, target_fd = -EBADF;
	struct lxc_open_how how = {
		.flags		= O_RDONLY | O_CLOEXEC | O_PATH,
		.resolve	= RESOLVE_NO_XDEV | RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS | RESOLVE_BENEATH,
	};
	/* Source handed to mount(); switched to the fd path for relative bind mounts. */
	const char *mnt_src = src;
	int ret;
	char src_buf[LXC_PROC_PID_FD_LEN], tgt_buf[LXC_PROC_PID_FD_LEN];

	if (beneath_fd < 0) {
		/* Callers look at errno, so do not leave a stale value behind. */
		errno = EINVAL;
		return -EINVAL;
	}

	if ((flags & MS_BIND) && src && src[0] != '/') {
		source_fd = openat2(beneath_fd, src, &how, sizeof(how));
		if (source_fd < 0)
			return -errno;

		ret = snprintf(src_buf, sizeof(src_buf), "/proc/self/fd/%d", source_fd);
		if (ret < 0 || (size_t)ret >= sizeof(src_buf)) {
			errno = EIO;
			return -EIO;
		}

		mnt_src = src_buf;
	}

	target_fd = openat2(beneath_fd, dst, &how, sizeof(how));
	if (target_fd < 0)
		return -errno;

	ret = snprintf(tgt_buf, sizeof(tgt_buf), "/proc/self/fd/%d", target_fd);
	if (ret < 0 || (size_t)ret >= sizeof(tgt_buf)) {
		errno = EIO;
		return -EIO;
	}

	return mount(mnt_src, tgt_buf, fstype, flags, data);
}
/*
 * Path-based front end for __safe_mount_beneath_at(): opens @beneath as an
 * O_PATH directory fd and mounts @src onto @dst beneath it.
 *
 * @beneath: directory the mount target must stay beneath; NULL means "/".
 *           The directory is opened with an invalid dirfd (-1), so a relative
 *           @beneath fails to open rather than resolving against the cwd.
 * @src, @dst, @fstype, @flags, @data: forwarded to __safe_mount_beneath_at().
 *
 * Returns a negative errno value (after logging) if @beneath cannot be
 * opened, otherwise whatever __safe_mount_beneath_at() returns.
 */
int safe_mount_beneath(const char *beneath, const char *src, const char *dst, const char *fstype,
		       unsigned int flags, const void *data)
{
	__do_close int beneath_fd = -EBADF;
	const char *path = beneath ? beneath : "/";

	/* Open @path, not @beneath: @beneath may be NULL and must then mean "/". */
	beneath_fd = openat(-1, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
	if (beneath_fd < 0)
		return log_error_errno(-errno, errno, "Failed to open %s", path);

	return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
}
/*
 * Descriptor-based entry point: mount @src onto @dst beneath the already-open
 * directory fd @beneath_fd. All arguments are forwarded unchanged to
 * __safe_mount_beneath_at() and its result is returned as-is.
 */
int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
			  unsigned int flags, const void *data)
{
	int ret;

	ret = __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
	return ret;
}
/*
* Safely mount a path into a container, ensuring that the mount target
* is under the container's @rootfs. (If @rootfs is NULL, then the container
......
......@@ -246,4 +246,9 @@ static inline bool gid_valid(gid_t gid)
__hidden extern bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res);
__hidden extern int safe_mount_beneath(const char *beneath, const char *src, const char *dst,
const char *fstype, unsigned int flags, const void *data);
/* Mount @src onto @dst, with @dst resolved beneath the directory fd @beneath_fd. */
__hidden extern int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst,
					  const char *fstype, unsigned int flags, const void *data);
#endif /* __LXC_UTILS_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment