Unverified Commit c11af973 by Stéphane Graber Committed by GitHub

Merge pull request #3709 from brauner/2021-03-17/idmapped_mounts_v2

Initial support for idmapped mounts
parents 12cf9f5a fa8e75f0
...@@ -652,8 +652,9 @@ AC_CHECK_HEADER([ifaddrs.h], ...@@ -652,8 +652,9 @@ AC_CHECK_HEADER([ifaddrs.h],
AC_HEADER_MAJOR AC_HEADER_MAJOR
# Check for some syscalls functions # Check for some syscalls functions
AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3 fsopen fspick fsconfig fsmount, openat2, close_range, statvfs]) AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3 fsopen fspick fsconfig fsmount, openat2, close_range, statvfs, mount_setattr])
AC_CHECK_TYPES([__aligned_u64], [], [], [[#include <linux/types.h>]]) AC_CHECK_TYPES([__aligned_u64], [], [], [[#include <linux/types.h>]])
AC_CHECK_TYPES([struct mount_attr], [], [], [[#include <linux/mount.h>]])
AC_CHECK_TYPES([struct open_how], [], [], [[#include <linux/openat2.h>]]) AC_CHECK_TYPES([struct open_how], [], [], [[#include <linux/openat2.h>]])
AC_CHECK_TYPES([struct clone_args], [], [], [[#include <linux/sched.h>]]) AC_CHECK_TYPES([struct clone_args], [], [], [[#include <linux/sched.h>]])
AC_CHECK_MEMBERS([struct clone_args.set_tid],[],[],[[#include <linux/sched.h>]]) AC_CHECK_MEMBERS([struct clone_args.set_tid],[],[],[[#include <linux/sched.h>]])
......
...@@ -181,10 +181,12 @@ static struct attach_context *alloc_attach_context(void) ...@@ -181,10 +181,12 @@ static struct attach_context *alloc_attach_context(void)
if (!ctx) if (!ctx)
return ret_set_errno(NULL, ENOMEM); return ret_set_errno(NULL, ENOMEM);
ctx->init_pid = -ESRCH;
ctx->dfd_self_pid = -EBADF; ctx->dfd_self_pid = -EBADF;
ctx->dfd_init_pid = -EBADF; ctx->dfd_init_pid = -EBADF;
ctx->init_pidfd = -EBADF; ctx->init_pidfd = -EBADF;
ctx->init_pid = -ESRCH;
ctx->setup_ns_uid = LXC_INVALID_UID; ctx->setup_ns_uid = LXC_INVALID_UID;
ctx->setup_ns_gid = LXC_INVALID_GID; ctx->setup_ns_gid = LXC_INVALID_GID;
ctx->target_ns_uid = LXC_INVALID_UID; ctx->target_ns_uid = LXC_INVALID_UID;
...@@ -192,7 +194,7 @@ static struct attach_context *alloc_attach_context(void) ...@@ -192,7 +194,7 @@ static struct attach_context *alloc_attach_context(void)
ctx->target_host_uid = LXC_INVALID_UID; ctx->target_host_uid = LXC_INVALID_UID;
ctx->target_host_gid = LXC_INVALID_GID; ctx->target_host_gid = LXC_INVALID_GID;
for (int i = 0; i < LXC_NS_MAX; i++) for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++)
ctx->ns_fd[i] = -EBADF; ctx->ns_fd[i] = -EBADF;
return ctx; return ctx;
...@@ -436,7 +438,7 @@ static int get_attach_context(struct attach_context *ctx, ...@@ -436,7 +438,7 @@ static int get_attach_context(struct attach_context *ctx,
if (options->namespaces == -1) if (options->namespaces == -1)
return log_error_errno(-EINVAL, EINVAL, "Failed to automatically determine the namespaces which the container uses"); return log_error_errno(-EINVAL, EINVAL, "Failed to automatically determine the namespaces which the container uses");
for (int i = 0; i < LXC_NS_MAX; i++) { for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
if (ns_info[i].clone_flag & CLONE_NEWCGROUP) if (ns_info[i].clone_flag & CLONE_NEWCGROUP)
if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) || if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) ||
!cgns_supported()) !cgns_supported())
...@@ -531,7 +533,7 @@ static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path) ...@@ -531,7 +533,7 @@ static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path)
static int __prepare_namespaces_pidfd(struct attach_context *ctx) static int __prepare_namespaces_pidfd(struct attach_context *ctx)
{ {
for (int i = 0; i < LXC_NS_MAX; i++) { for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
int ret; int ret;
ret = same_nsfd(ctx->dfd_self_pid, ret = same_nsfd(ctx->dfd_self_pid,
...@@ -559,8 +561,8 @@ static int __prepare_namespaces_pidfd(struct attach_context *ctx) ...@@ -559,8 +561,8 @@ static int __prepare_namespaces_pidfd(struct attach_context *ctx)
static int __prepare_namespaces_nsfd(struct attach_context *ctx, static int __prepare_namespaces_nsfd(struct attach_context *ctx,
lxc_attach_options_t *options) lxc_attach_options_t *options)
{ {
for (int i = 0; i < LXC_NS_MAX; i++) { for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
int j; lxc_namespace_t j;
if (options->namespaces & ns_info[i].clone_flag) if (options->namespaces & ns_info[i].clone_flag)
ctx->ns_fd[i] = open_at(ctx->dfd_init_pid, ctx->ns_fd[i] = open_at(ctx->dfd_init_pid,
...@@ -642,7 +644,7 @@ static int __attach_namespaces_nsfd(struct attach_context *ctx, ...@@ -642,7 +644,7 @@ static int __attach_namespaces_nsfd(struct attach_context *ctx,
{ {
int fret = 0; int fret = 0;
for (int i = 0; i < LXC_NS_MAX; i++) { for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
int ret; int ret;
if (ctx->ns_fd[i] < 0) if (ctx->ns_fd[i] < 0)
...@@ -670,7 +672,7 @@ static int attach_namespaces(struct attach_context *ctx, ...@@ -670,7 +672,7 @@ static int attach_namespaces(struct attach_context *ctx,
lxc_attach_options_t *options) lxc_attach_options_t *options)
{ {
if (lxc_log_trace()) { if (lxc_log_trace()) {
for (int i = 0; i < LXC_NS_MAX; i++) { for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
if (ns_info[i].clone_flag & options->namespaces) { if (ns_info[i].clone_flag & options->namespaces) {
TRACE("Attaching to %s namespace", ns_info[i].proc_name); TRACE("Attaching to %s namespace", ns_info[i].proc_name);
continue; continue;
......
...@@ -98,6 +98,10 @@ ...@@ -98,6 +98,10 @@
#include <../include/prlimit.h> #include <../include/prlimit.h>
#endif #endif
#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif
lxc_log_define(conf, lxc); lxc_log_define(conf, lxc);
/* /*
...@@ -484,11 +488,24 @@ int run_script(const char *name, const char *section, const char *script, ...) ...@@ -484,11 +488,24 @@ int run_script(const char *name, const char *section, const char *script, ...)
*/ */
int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns) int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
{ {
__do_close int dfd_path = -EBADF, fd_pin = -EBADF; __do_close int dfd_path = -EBADF, fd_pin = -EBADF, fd_userns = -EBADF;
int ret; int ret;
struct stat st; struct stat st;
struct statfs stfs; struct statfs stfs;
if (!is_empty_string(rootfs->mnt_opts.userns_path)) {
if (!rootfs->path)
return syserror_set(-EINVAL, "Idmapped rootfs currently only supported with separate rootfs for container");
if (rootfs->bdev_type && !strequal(rootfs->bdev_type, "dir"))
return syserror_set(-EINVAL, "Idmapped rootfs currently only supports the \"dir\" storage driver");
fd_userns = open_at(-EBADF, rootfs->mnt_opts.userns_path,
PROTECT_OPEN_WITH_TRAILING_SYMLINKS, 0, 0);
if (fd_userns < 0)
return syserror("Failed to open user namespace");
}
if (rootfs->path) { if (rootfs->path) {
if (rootfs->bdev_type && if (rootfs->bdev_type &&
(strequal(rootfs->bdev_type, "overlay") || (strequal(rootfs->bdev_type, "overlay") ||
...@@ -500,13 +517,17 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns) ...@@ -500,13 +517,17 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
dfd_path = open_at(-EBADF, "/", PROTECT_OPATH_FILE, PROTECT_LOOKUP_ABSOLUTE, 0); dfd_path = open_at(-EBADF, "/", PROTECT_OPATH_FILE, PROTECT_LOOKUP_ABSOLUTE, 0);
} }
if (dfd_path < 0) if (dfd_path < 0)
return log_error_errno(-errno, errno, "Failed to open \"%s\"", rootfs->path); return syserror("Failed to open \"%s\"", rootfs->path);
if (!rootfs->path) if (!rootfs->path) {
return log_trace(0, "Not pinning because container does not have a rootfs"); TRACE("Not pinning because container does not have a rootfs");
goto out;
}
if (userns) if (userns) {
return log_trace(0, "Not pinning because container runs in user namespace"); TRACE("Not pinning because container runs in user namespace");
goto out;
}
ret = fstat(dfd_path, &st); ret = fstat(dfd_path, &st);
if (ret < 0) if (ret < 0)
...@@ -520,7 +541,7 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns) ...@@ -520,7 +541,7 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
PROTECT_LOOKUP_BENEATH, PROTECT_LOOKUP_BENEATH,
S_IWUSR | S_IRUSR); S_IWUSR | S_IRUSR);
if (fd_pin < 0) if (fd_pin < 0)
return log_error_errno(-errno, errno, "Failed to pin rootfs"); return syserror("Failed to pin rootfs");
TRACE("Pinned rootfs %d(.lxc_keep)", fd_pin); TRACE("Pinned rootfs %d(.lxc_keep)", fd_pin);
...@@ -542,6 +563,7 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns) ...@@ -542,6 +563,7 @@ int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
out: out:
rootfs->fd_path_pin = move_fd(fd_pin); rootfs->fd_path_pin = move_fd(fd_pin);
rootfs->mnt_opts.userns_fd = move_fd(fd_userns);
return 0; return 0;
} }
...@@ -2090,34 +2112,70 @@ skipremount: ...@@ -2090,34 +2112,70 @@ skipremount:
return 0; return 0;
} }
/*
 * Names of the LXC-specific mount options, indexed by lxc_mount_options_t.
 * The order of the entries has to match the enum values: parse_lxc_mntopts()
 * walks this table by index and switches on that index to decide which
 * option it found.
 */
const char *lxc_mount_options_info[LXC_MOUNT_MAX] = {
"create=dir",
"create=file",
"optional",
"relative",
"idmap=", /* prefix only; the user namespace path follows the '=' */
};
/* Remove "optional", "create=dir", and "create=file" from mntopt */ /* Remove "optional", "create=dir", and "create=file" from mntopt */
static void cull_mntent_opt(struct mntent *mntent) int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts)
{ {
int i; for (size_t i = LXC_MOUNT_CREATE_DIR; i < LXC_MOUNT_MAX; i++) {
char *list[] = { __do_close int fd_userns = -EBADF;
"create=dir", const char *opt_name = lxc_mount_options_info[i];
"create=file", size_t len;
"optional", char *idmap_path, *p, *p2;
"relative",
NULL
};
for (i = 0; list[i]; i++) {
char *p, *p2;
p = strstr(mntent->mnt_opts, list[i]); p = strstr(mnt_opts, opt_name);
if (!p) if (!p)
continue; continue;
p2 = strchr(p, ','); switch (i) {
if (!p2) { case LXC_MOUNT_CREATE_DIR:
/* no more mntopts, so just chop it here */ opts->create_dir = 1;
*p = '\0'; break;
continue; case LXC_MOUNT_CREATE_FILE:
opts->create_file = 1;
break;
case LXC_MOUNT_OPTIONAL:
opts->optional = 1;
break;
case LXC_MOUNT_RELATIVE:
opts->relative = 1;
break;
case LXC_MOUNT_IDMAP:
p2 = p;
p2 += STRLITERALLEN("idmap=");
idmap_path = strchrnul(p2, ',');
len = strlcpy(opts->userns_path, p2, idmap_path - p2 + 1);
if (len >= sizeof(opts->userns_path))
return syserror_set(-EIO, "Excessive idmap path length for \"idmap=<path>\" LXC specific mount option");
if (is_empty_string(opts->userns_path))
return syserror_set(-EINVAL, "Missing idmap path for \"idmap=<path>\" LXC specific mount option");
fd_userns = open(opts->userns_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
if (fd_userns < 0)
return syserror("Failed to open user namespace");
TRACE("Parse LXC specific mount option %d->\"idmap=%s\"", fd_userns, opts->userns_path);
break;
default:
return syserror_set(-EINVAL, "Unknown LXC specific mount option");
} }
memmove(p, p2 + 1, strlen(p2 + 1) + 1); p2 = strchr(p, ',');
if (!p2)
*p = '\0'; /* no more mntopts, so just chop it here */
else
memmove(p, p2 + 1, strlen(p2 + 1) + 1);
} }
return 0;
} }
static int mount_entry_create_dir_file(const struct mntent *mntent, static int mount_entry_create_dir_file(const struct mntent *mntent,
...@@ -2178,6 +2236,7 @@ static inline int mount_entry_on_generic(struct mntent *mntent, ...@@ -2178,6 +2236,7 @@ static inline int mount_entry_on_generic(struct mntent *mntent,
char *rootfs_path = NULL; char *rootfs_path = NULL;
int ret; int ret;
bool dev, optional, relative; bool dev, optional, relative;
struct lxc_mount_options opts = {};
optional = hasmntopt(mntent, "optional") != NULL; optional = hasmntopt(mntent, "optional") != NULL;
dev = hasmntopt(mntent, "dev") != NULL; dev = hasmntopt(mntent, "dev") != NULL;
...@@ -2194,7 +2253,13 @@ static inline int mount_entry_on_generic(struct mntent *mntent, ...@@ -2194,7 +2253,13 @@ static inline int mount_entry_on_generic(struct mntent *mntent,
return -1; return -1;
} }
cull_mntent_opt(mntent);
ret = parse_lxc_mntopts(&opts, mntent->mnt_opts);
if (ret < 0)
return ret;
if (!is_empty_string(opts.userns_path))
return syserror_set(-EINVAL, "Idmapped mount entries not yet supported");
ret = parse_propagationopts(mntent->mnt_opts, &pflags); ret = parse_propagationopts(mntent->mnt_opts, &pflags);
if (ret < 0) if (ret < 0)
...@@ -2686,6 +2751,7 @@ struct lxc_conf *lxc_conf_init(void) ...@@ -2686,6 +2751,7 @@ struct lxc_conf *lxc_conf_init(void)
new->rootfs.dfd_dev = -EBADF; new->rootfs.dfd_dev = -EBADF;
new->rootfs.dfd_host = -EBADF; new->rootfs.dfd_host = -EBADF;
new->rootfs.fd_path_pin = -EBADF; new->rootfs.fd_path_pin = -EBADF;
new->rootfs.mnt_opts.userns_fd = -EBADF;
new->logfd = -1; new->logfd = -1;
lxc_list_init(&new->cgroup); lxc_list_init(&new->cgroup);
lxc_list_init(&new->cgroup2); lxc_list_init(&new->cgroup2);
......
...@@ -181,6 +181,26 @@ struct lxc_tty_info { ...@@ -181,6 +181,26 @@ struct lxc_tty_info {
struct lxc_terminal_info *tty; struct lxc_terminal_info *tty;
}; };
typedef enum lxc_mount_options_t {
LXC_MOUNT_CREATE_DIR = 0,
LXC_MOUNT_CREATE_FILE = 1,
LXC_MOUNT_OPTIONAL = 2,
LXC_MOUNT_RELATIVE = 3,
LXC_MOUNT_IDMAP = 4,
LXC_MOUNT_MAX = 5,
} lxc_mount_options_t;
__hidden extern const char *lxc_mount_options_info[LXC_MOUNT_MAX];
/* Defines a structure to store the LXC specific mount options that were
 * parsed out of a mount option string
 * @create_dir  : set when "create=dir" was given
 * @create_file : set when "create=file" was given
 * @optional    : set when "optional" was given
 * @relative    : set when "relative" was given
 * @userns_path : path given via "idmap=<path>"; empty string when not given
 * @userns_fd   : file descriptor for the user namespace; a negative value
 *                means "no idmapped mount" (see idmapped_rootfs_mnt())
 *
 * The flags are unsigned on purpose: whether a plain "int" bit-field is
 * signed is implementation-defined, and where it is signed a 1-bit field can
 * only hold 0 and -1, so storing 1 would read back as -1.
 */
struct lxc_mount_options {
	unsigned int create_dir : 1;
	unsigned int create_file : 1;
	unsigned int optional : 1;
	unsigned int relative : 1;
	char userns_path[PATH_MAX];
	int userns_fd;
};
/* Defines a structure to store the rootfs location, the /* Defines a structure to store the rootfs location, the
* optionals pivot_root, rootfs mount paths * optionals pivot_root, rootfs mount paths
* @path : the rootfs source (directory or device) * @path : the rootfs source (directory or device)
...@@ -211,6 +231,7 @@ struct lxc_rootfs { ...@@ -211,6 +231,7 @@ struct lxc_rootfs {
unsigned long mountflags; unsigned long mountflags;
char *data; char *data;
bool managed; bool managed;
struct lxc_mount_options mnt_opts;
}; };
/* /*
...@@ -509,6 +530,7 @@ __hidden extern int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), v ...@@ -509,6 +530,7 @@ __hidden extern int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), v
const char *fn_name); const char *fn_name);
__hidden extern int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata); __hidden extern int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata);
__hidden extern int parse_propagationopts(const char *mntopts, unsigned long *pflags); __hidden extern int parse_propagationopts(const char *mntopts, unsigned long *pflags);
__hidden extern int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts);
__hidden extern void tmp_proc_unmount(struct lxc_conf *lxc_conf); __hidden extern void tmp_proc_unmount(struct lxc_conf *lxc_conf);
__hidden extern void suggest_default_idmap(void); __hidden extern void suggest_default_idmap(void);
__hidden extern FILE *make_anonymous_mount_file(struct lxc_list *mount, bool include_nesting_helpers); __hidden extern FILE *make_anonymous_mount_file(struct lxc_list *mount, bool include_nesting_helpers);
...@@ -554,12 +576,18 @@ static inline const char *get_rootfs_mnt(const struct lxc_rootfs *rootfs) ...@@ -554,12 +576,18 @@ static inline const char *get_rootfs_mnt(const struct lxc_rootfs *rootfs)
return !is_empty_string(rootfs->path) ? rootfs->mount : s; return !is_empty_string(rootfs->path) ? rootfs->mount : s;
} }
/* Report whether an idmapped mount was requested for the rootfs, i.e. whether
 * a valid (non-negative) user namespace fd is stored in its mount options.
 */
static inline bool idmapped_rootfs_mnt(const struct lxc_rootfs *rootfs)
{
	const int fd_userns = rootfs->mnt_opts.userns_fd;

	if (fd_userns < 0)
		return false;

	return true;
}
static inline void put_lxc_rootfs(struct lxc_rootfs *rootfs, bool unpin) static inline void put_lxc_rootfs(struct lxc_rootfs *rootfs, bool unpin)
{ {
if (rootfs) { if (rootfs) {
close_prot_errno_disarm(rootfs->dfd_host); close_prot_errno_disarm(rootfs->dfd_host);
close_prot_errno_disarm(rootfs->dfd_mnt); close_prot_errno_disarm(rootfs->dfd_mnt);
close_prot_errno_disarm(rootfs->dfd_dev); close_prot_errno_disarm(rootfs->dfd_dev);
close_prot_errno_disarm(rootfs->mnt_opts.userns_fd);
if (unpin) if (unpin)
close_prot_errno_disarm(rootfs->fd_path_pin); close_prot_errno_disarm(rootfs->fd_path_pin);
} }
......
...@@ -2790,7 +2790,7 @@ static int set_config_rootfs_mount(const char *key, const char *value, ...@@ -2790,7 +2790,7 @@ static int set_config_rootfs_mount(const char *key, const char *value,
static int set_config_rootfs_options(const char *key, const char *value, static int set_config_rootfs_options(const char *key, const char *value,
struct lxc_conf *lxc_conf, void *data) struct lxc_conf *lxc_conf, void *data)
{ {
__do_free char *mdata = NULL, *opts = NULL; __do_free char *dup = NULL, *mdata = NULL, *opts = NULL;
unsigned long mflags = 0, pflags = 0; unsigned long mflags = 0, pflags = 0;
struct lxc_rootfs *rootfs = &lxc_conf->rootfs; struct lxc_rootfs *rootfs = &lxc_conf->rootfs;
int ret; int ret;
...@@ -2799,18 +2799,30 @@ static int set_config_rootfs_options(const char *key, const char *value, ...@@ -2799,18 +2799,30 @@ static int set_config_rootfs_options(const char *key, const char *value,
if (lxc_config_value_empty(value)) if (lxc_config_value_empty(value))
return 0; return 0;
ret = parse_mntopts(value, &mflags, &mdata); dup = strdup(value);
if (!dup)
return -ENOMEM;
ret = parse_lxc_mntopts(&rootfs->mnt_opts, dup);
if (ret < 0)
return ret;
ret = parse_mntopts(dup, &mflags, &mdata);
if (ret < 0) if (ret < 0)
return ret_errno(EINVAL); return ret_errno(EINVAL);
ret = parse_propagationopts(value, &pflags); ret = parse_propagationopts(dup, &pflags);
if (ret < 0) if (ret < 0)
return ret_errno(EINVAL); return ret_errno(EINVAL);
ret = set_config_string_item(&opts, value); ret = set_config_string_item(&opts, dup);
if (ret < 0) if (ret < 0)
return ret_errno(ENOMEM); return ret_errno(ENOMEM);
if (rootfs->mnt_opts.create_dir || rootfs->mnt_opts.create_file ||
rootfs->mnt_opts.optional || rootfs->mnt_opts.relative)
return syserror_set(-EINVAL, "Invalid LXC specifc mount option for rootfs mount");
rootfs->mountflags = mflags | pflags; rootfs->mountflags = mflags | pflags;
rootfs->options = move_ptr(opts); rootfs->options = move_ptr(opts);
rootfs->data = move_ptr(mdata); rootfs->data = move_ptr(mdata);
......
...@@ -1165,15 +1165,15 @@ static int apparmor_process_label_fd_get(struct lsm_ops *ops, pid_t pid, bool on ...@@ -1165,15 +1165,15 @@ static int apparmor_process_label_fd_get(struct lsm_ops *ops, pid_t pid, bool on
static int apparmor_process_label_set_at(struct lsm_ops *ops, int label_fd, const char *label, bool on_exec) static int apparmor_process_label_set_at(struct lsm_ops *ops, int label_fd, const char *label, bool on_exec)
{ {
__do_free char *command = NULL;
int ret = -1; int ret = -1;
size_t len; size_t len;
__do_free char *command = NULL;
if (on_exec) if (on_exec)
log_trace(0, "Changing AppArmor profile on exec not supported"); TRACE("Changing AppArmor profile on exec not supported");
len = strlen(label) + strlen("changeprofile ") + 1; len = strlen(label) + strlen("changeprofile ") + 1;
command = malloc(len); command = zalloc(len);
if (!command) if (!command)
return ret_errno(ENOMEM); return ret_errno(ENOMEM);
......
...@@ -236,16 +236,76 @@ int fs_attach(int fd_fs, ...@@ -236,16 +236,76 @@ int fs_attach(int fd_fs,
return 0; return 0;
} }
int fd_bind_mount(int dfd_from, const char *path_from, int create_detached_idmapped_mount(const char *path, int userns_fd, bool recursive)
__u64 o_flags_from, __u64 resolve_flags_from, {
int dfd_to, const char *path_to, __do_close int fd_tree_from = -EBADF;
__u64 o_flags_to, __u64 resolve_flags_to, unsigned int open_tree_flags = OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC;
unsigned int attr_flags, bool recursive) struct lxc_mount_attr attr = {
.attr_set = MOUNT_ATTR_IDMAP,
.userns_fd = userns_fd,
};
int ret;
TRACE("Idmapped mount \"%s\" requested with user namespace fd %d", path, userns_fd);
if (recursive)
open_tree_flags |= AT_RECURSIVE;
fd_tree_from = open_tree(-EBADF, path, open_tree_flags);
if (fd_tree_from < 0)
return syserror("Failed to create detached mount");
ret = mount_setattr(fd_tree_from, "",
AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0),
&attr, sizeof(attr));
if (ret < 0)
return syserror("Failed to change mount attributes");
return move_fd(fd_tree_from);
}
/*
 * Attach the detached mount referred to by @dfd_from to the filesystem.
 *
 * The target is @dfd_to itself when @path_to is empty. Otherwise @path_to is
 * opened relative to @dfd_to via openat2() with @o_flags_to and
 * @resolve_flags_to, and the resulting fd is used as the target.
 *
 * Returns 0 on success, -errno if the target could not be opened, and the
 * result of syserror() if move_mount() fails.
 */
int move_detached_mount(int dfd_from, int dfd_to, const char *path_to,
			__u64 o_flags_to, __u64 resolve_flags_to)
{
	/* Only set when we open the target ourselves; closed on return. */
	__do_close int fd_opened = -EBADF;
	int fd_target = dfd_to;

	if (!is_empty_string(path_to)) {
		struct lxc_open_how how = {
			.flags		= o_flags_to,
			.resolve	= resolve_flags_to,
		};

		fd_opened = openat2(dfd_to, path_to, &how, sizeof(how));
		if (fd_opened < 0)
			return -errno;

		fd_target = fd_opened;
	}

	if (move_mount(dfd_from, "", fd_target, "",
		       MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH))
		return syserror("Failed to attach detached mount %d to filesystem at %d", dfd_from, fd_target);

	TRACE("Attach detached mount %d to filesystem at %d", dfd_from, fd_target);
	return 0;
}
static int __fd_bind_mount(int dfd_from, const char *path_from,
__u64 o_flags_from, __u64 resolve_flags_from,
int dfd_to, const char *path_to, __u64 o_flags_to,
__u64 resolve_flags_to, unsigned int attr_flags,
int userns_fd, bool recursive)
{ {
__do_close int __fd_from = -EBADF, __fd_to = -EBADF; struct lxc_mount_attr attr = {
.attr_set = attr_flags,
};
__do_close int __fd_from = -EBADF;
__do_close int fd_tree_from = -EBADF; __do_close int fd_tree_from = -EBADF;
unsigned int open_tree_flags = AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC; unsigned int open_tree_flags = AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC;
int fd_from, fd_to, ret; int fd_from, ret;
if (!is_empty_string(path_from)) { if (!is_empty_string(path_from)) {
struct lxc_open_how how = { struct lxc_open_how how = {
...@@ -266,28 +326,46 @@ int fd_bind_mount(int dfd_from, const char *path_from, ...@@ -266,28 +326,46 @@ int fd_bind_mount(int dfd_from, const char *path_from,
fd_tree_from = open_tree(fd_from, "", open_tree_flags); fd_tree_from = open_tree(fd_from, "", open_tree_flags);
if (fd_tree_from < 0) if (fd_tree_from < 0)
return log_error_errno(-errno, errno, "Failed to create detached mount"); return syserror("Failed to create detached mount");
if (!is_empty_string(path_to)) { if (userns_fd >= 0) {
struct lxc_open_how how = { attr.attr_set |= MOUNT_ATTR_IDMAP;
.flags = o_flags_to, attr.userns_fd = userns_fd;
.resolve = resolve_flags_to, TRACE("Idmapped mount requested with user namespace fd %d", userns_fd);
}; }
__fd_to = openat2(dfd_to, path_to, &how, sizeof(how)); if (attr.attr_set) {
if (__fd_to < 0) ret = mount_setattr(fd_tree_from, "",
return -errno; AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0),
fd_to = __fd_to; &attr, sizeof(attr));
} else { if (ret < 0)
fd_to = dfd_to; return syserror("Failed to change mount attributes");
} }
ret = move_mount(fd_tree_from, "", fd_to, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH); return move_detached_mount(fd_tree_from, dfd_to, path_to, o_flags_to,
if (ret) resolve_flags_to);
return log_error_errno(-errno, errno, "Failed to attach detached mount %d to filesystem at %d", fd_tree_from, fd_to); }
TRACE("Attach detached mount %d to filesystem at %d", fd_tree_from, fd_to); int fd_mount_idmapped(int dfd_from, const char *path_from,
return 0; __u64 o_flags_from, __u64 resolve_flags_from,
int dfd_to, const char *path_to,
__u64 o_flags_to, __u64 resolve_flags_to,
unsigned int attr_flags, int userns_fd, bool recursive)
{
return __fd_bind_mount(dfd_from, path_from, o_flags_from, resolve_flags_from,
dfd_to, path_to, o_flags_to, resolve_flags_to,
attr_flags, userns_fd, recursive);
}
int fd_bind_mount(int dfd_from, const char *path_from,
__u64 o_flags_from, __u64 resolve_flags_from,
int dfd_to, const char *path_to,
__u64 o_flags_to, __u64 resolve_flags_to,
unsigned int attr_flags, bool recursive)
{
return __fd_bind_mount(dfd_from, path_from, o_flags_from, resolve_flags_from,
dfd_to, path_to, o_flags_to, resolve_flags_to,
attr_flags, -EBADF, recursive);
} }
int calc_remount_flags_new(int dfd_from, const char *path_from, int calc_remount_flags_new(int dfd_from, const char *path_from,
...@@ -488,3 +566,28 @@ bool can_use_mount_api(void) ...@@ -488,3 +566,28 @@ bool can_use_mount_api(void)
return supported == 1; return supported == 1;
} }
/*
 * Tell whether bind mounts can be done through the new mount api.
 *
 * The answer is computed on the first call and cached in a function-local
 * static. It is false when can_use_mount_api() is false, or when the
 * mount_setattr() probe either succeeds or fails with ENOSYS.
 */
bool can_use_bind_mounts(void)
{
	static int supported = -1;

	/* Already probed: hand back the cached answer. */
	if (supported != -1)
		return supported == 1;

	if (!can_use_mount_api()) {
		supported = 0;
		return false;
	}

	/*
	 * Probe with deliberately invalid arguments; presumably a kernel that
	 * implements the syscall rejects them with an error other than ENOSYS.
	 */
	if (mount_setattr(-EBADF, NULL, 0, NULL, 0) == 0 || errno == ENOSYS) {
		supported = 0;
		return false;
	}

	supported = 1;
	TRACE("Kernel supports bind mounts in the new mount api");
	return true;
}
...@@ -152,6 +152,10 @@ ...@@ -152,6 +152,10 @@
#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ #define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
#endif #endif
/* Mount attribute requesting an idmapped mount. It is set in
 * lxc_mount_attr.attr_set together with a user namespace fd in
 * lxc_mount_attr.userns_fd. Fallback for headers that do not define it.
 */
#ifndef MOUNT_ATTR_IDMAP
#define MOUNT_ATTR_IDMAP 0x00100000
#endif
__hidden extern int mnt_attributes_new(unsigned int old_flags, unsigned int *new_flags); __hidden extern int mnt_attributes_new(unsigned int old_flags, unsigned int *new_flags);
__hidden extern int mnt_attributes_old(unsigned int new_flags, unsigned int *old_flags); __hidden extern int mnt_attributes_old(unsigned int new_flags, unsigned int *old_flags);
...@@ -185,6 +189,18 @@ __hidden extern int fd_bind_mount(int dfd_from, const char *path_from, ...@@ -185,6 +189,18 @@ __hidden extern int fd_bind_mount(int dfd_from, const char *path_from,
__u64 o_flags_to, __u64 resolve_flags_to, __u64 o_flags_to, __u64 resolve_flags_to,
unsigned int attr_flags, bool recursive); unsigned int attr_flags, bool recursive);
__hidden extern int fd_mount_idmapped(int dfd_from, const char *path_from,
__u64 o_flags_from, __u64 resolve_flags_from,
int dfd_to, const char *path_to,
__u64 o_flags_to, __u64 resolve_flags_to,
unsigned int attr_flags, int userns_fd,
bool recursive);
__hidden extern int create_detached_idmapped_mount(const char *path,
int userns_fd, bool recursive);
__hidden extern int move_detached_mount(int dfd_from, int dfd_to,
const char *path_to, __u64 o_flags_to,
__u64 resolve_flags_to);
__hidden extern int calc_remount_flags_new(int dfd_from, const char *path_from, __hidden extern int calc_remount_flags_new(int dfd_from, const char *path_from,
__u64 o_flags_from, __u64 o_flags_from,
__u64 resolve_flags_from, __u64 resolve_flags_from,
...@@ -202,5 +218,6 @@ __hidden extern unsigned long add_required_remount_flags(const char *s, ...@@ -202,5 +218,6 @@ __hidden extern unsigned long add_required_remount_flags(const char *s,
unsigned long flags); unsigned long flags);
__hidden extern bool can_use_mount_api(void); __hidden extern bool can_use_mount_api(void);
__hidden extern bool can_use_bind_mounts(void);
#endif /* __LXC_MOUNT_UTILS_H */ #endif /* __LXC_MOUNT_UTILS_H */
...@@ -1645,16 +1645,6 @@ static int lxc_spawn(struct lxc_handler *handler) ...@@ -1645,16 +1645,6 @@ static int lxc_spawn(struct lxc_handler *handler)
goto out_delete_net; goto out_delete_net;
} }
/* If the rootfs is not a blockdev, prevent the container from marking
* it readonly.
* If the container is unprivileged then skip rootfs pinning.
*/
ret = lxc_rootfs_prepare(&conf->rootfs, wants_to_map_ids);
if (ret) {
ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
goto out_delete_net;
}
/* Create a process in a new set of namespaces. */ /* Create a process in a new set of namespaces. */
if (share_ns) { if (share_ns) {
pid_t attacher_pid; pid_t attacher_pid;
...@@ -2040,9 +2030,34 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops, ...@@ -2040,9 +2030,34 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
goto out_abort; goto out_abort;
} }
/* If the rootfs is not a blockdev, prevent the container from marking
* it readonly.
* If the container is unprivileged then skip rootfs pinning.
*/
ret = lxc_rootfs_prepare(&conf->rootfs, !lxc_list_empty(&conf->id_map));
if (ret) {
ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
ret = -1;
goto out_abort;
}
if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) { if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
/* If the backing store is a device, mount it here and now. */ /*
if (rootfs_is_blockdev(conf)) { * This handles two cases: mounting real block devices and
* creating idmapped mounts. The block device case should be
* obvious, i.e. no real filesystem can currently be mounted
* from inside a user namespace.
*
* Idmapped mounts can currently only be created if the caller
* is privileged with respect to the user namespace in which the
* underlying block device has been mounted. This basically
* (with few exceptions) means we need to be CAP_SYS_ADMIN in
* the initial user namespace since almost no interesting
* filesystems can be mounted inside of user namespaces. This
* is why we need to do the rootfs setup here. In the future
* this may change.
*/
if (idmapped_rootfs_mnt(&conf->rootfs) || rootfs_is_blockdev(conf)) {
ret = unshare(CLONE_NEWNS); ret = unshare(CLONE_NEWNS);
if (ret < 0) { if (ret < 0) {
ERROR("Failed to unshare CLONE_NEWNS"); ERROR("Failed to unshare CLONE_NEWNS");
......
...@@ -148,23 +148,46 @@ int dir_mount(struct lxc_storage *bdev) ...@@ -148,23 +148,46 @@ int dir_mount(struct lxc_storage *bdev)
src = lxc_storage_get_path(bdev->src, bdev->type); src = lxc_storage_get_path(bdev->src, bdev->type);
ret = mount(src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags | pflags, mntdata); if (can_use_bind_mounts()) {
if (ret < 0) __do_close int fd_source = -EBADF, fd_target = -EBADF;
return log_error_errno(-errno, errno, "Failed to mount \"%s\" on \"%s\"", src, bdev->dest);
fd_source = open_at(-EBADF, src, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
if (ret == 0 && (mntflags & MS_RDONLY)) { if (fd_source < 0)
mflags = add_required_remount_flags(src, bdev->dest, MS_BIND | MS_REC | mntflags | pflags | MS_REMOUNT); return syserror("Failed to open \"%s\"", src);
ret = mount(src, bdev->dest, "bind", mflags, mntdata);
fd_target = open_at(-EBADF, bdev->dest, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
if (fd_target < 0)
return syserror("Failed to open \"%s\"", bdev->dest);
ret = fd_mount_idmapped(fd_source, "", PROTECT_OPATH_DIRECTORY,
PROTECT_LOOKUP_BENEATH, fd_target, "",
PROTECT_OPATH_DIRECTORY,
PROTECT_LOOKUP_BENEATH, 0,
bdev->rootfs->mnt_opts.userns_fd, true);
if (ret < 0)
return syserror("Failed to mount \"%s\" onto \"%s\"", src, bdev->dest);
} else {
ret = mount(src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags | pflags, mntdata);
if (ret < 0) if (ret < 0)
return log_error_errno(-errno, errno, "Failed to remount \"%s\" on \"%s\" read-only with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"", return log_error_errno(-errno, errno, "Failed to mount \"%s\" on \"%s\"", src, bdev->dest);
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
else if (ret == 0 && (mntflags & MS_RDONLY)) {
DEBUG("Remounted \"%s\" on \"%s\" read-only with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"", mflags = add_required_remount_flags(src, bdev->dest, MS_BIND | MS_REC | mntflags | pflags | MS_REMOUNT);
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
ret = mount(src, bdev->dest, "bind", mflags, mntdata);
if (ret < 0)
return log_error_errno(-errno, errno, "Failed to remount \"%s\" on \"%s\" read-only with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"",
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
else
DEBUG("Remounted \"%s\" on \"%s\" read-only with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"",
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
}
TRACE("Mounted \"%s\" on \"%s\" with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"",
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
} }
TRACE("Mounted \"%s\" on \"%s\" with options \"%s\", mount flags \"%lu\", and propagation flags \"%lu\"", TRACE("Mounted \"%s\" onto \"%s\"", src, bdev->dest);
src ? src : "(none)", bdev->dest ? bdev->dest : "(none)", mntdata, mflags, pflags);
return 0; return 0;
} }
......
...@@ -598,14 +598,13 @@ struct lxc_storage *storage_init(struct lxc_conf *conf) ...@@ -598,14 +598,13 @@ struct lxc_storage *storage_init(struct lxc_conf *conf)
if (!q) if (!q)
return NULL; return NULL;
bdev = malloc(sizeof(struct lxc_storage)); bdev = zalloc(sizeof(struct lxc_storage));
if (!bdev) if (!bdev)
return NULL; return NULL;
memset(bdev, 0, sizeof(struct lxc_storage)); bdev->ops = q->ops;
bdev->type = q->name;
bdev->ops = q->ops; bdev->rootfs = &conf->rootfs;
bdev->type = q->name;
if (mntopts) if (mntopts)
bdev->mntopts = strdup(mntopts); bdev->mntopts = strdup(mntopts);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#endif #endif
#include "compiler.h" #include "compiler.h"
#include "conf.h"
#ifndef MS_DIRSYNC #ifndef MS_DIRSYNC
#define MS_DIRSYNC 128 #define MS_DIRSYNC 128
...@@ -87,6 +88,7 @@ struct lxc_storage { ...@@ -87,6 +88,7 @@ struct lxc_storage {
/* index for the connected nbd device. */ /* index for the connected nbd device. */
int nbd_idx; int nbd_idx;
int flags; int flags;
struct lxc_rootfs *rootfs;
}; };
/** /**
......
...@@ -680,4 +680,24 @@ ...@@ -680,4 +680,24 @@
#endif #endif
#endif #endif
/*
 * Fallback syscall number for mount_setattr() when the system headers do not
 * define __NR_mount_setattr. The number is 442 plus an architecture-specific
 * base on MIPS and ia64, 552 on alpha, and plain 442 everywhere else.
 */
#ifndef __NR_mount_setattr
#if defined __alpha__
#define __NR_mount_setattr 552
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_mount_setattr (442 + 4000)
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_mount_setattr (442 + 6000)
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_mount_setattr (442 + 5000)
#endif
#elif defined __ia64__
#define __NR_mount_setattr (442 + 1024)
#else
#define __NR_mount_setattr 442
#endif
#endif
#endif /* __LXC_SYSCALL_NUMBERS_H */ #endif /* __LXC_SYSCALL_NUMBERS_H */
...@@ -209,6 +209,24 @@ extern int fsmount(int fs_fd, unsigned int flags, unsigned int attr_flags); ...@@ -209,6 +209,24 @@ extern int fsmount(int fs_fd, unsigned int flags, unsigned int attr_flags);
#endif #endif
/* /*
* mount_setattr()
*/
/* Argument block handed to mount_setattr().
 * @attr_set    : mount attributes to set, e.g. MOUNT_ATTR_IDMAP
 * @attr_clr    : presumably mount attributes to clear; not used by the
 *                callers visible here - TODO confirm
 * @propagation : presumably the mount propagation to apply; not used by the
 *                callers visible here - TODO confirm
 * @userns_fd   : user namespace fd, filled in together with MOUNT_ATTR_IDMAP
 *
 * NOTE(review): the layout looks like it mirrors the kernel's
 * struct mount_attr; confirm before reordering or resizing any field.
 */
struct lxc_mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u64 propagation;
__u64 userns_fd;
};
#ifndef HAVE_MOUNT_SETATTR
/* Fallback wrapper for when the C library does not provide mount_setattr():
 * forwards all arguments unchanged to the raw syscall and returns its result.
 */
static inline int mount_setattr(int dfd, const char *path, unsigned int flags,
				struct lxc_mount_attr *attr, size_t size)
{
	long ret;

	ret = syscall(__NR_mount_setattr, dfd, path, flags, attr, size);

	return ret;
}
#endif
/*
* Arguments for how openat2(2) should open the target path. If only @flags and * Arguments for how openat2(2) should open the target path. If only @flags and
* @mode are non-zero, then openat2(2) operates very similarly to openat(2). * @mode are non-zero, then openat2(2) operates very similarly to openat(2).
* *
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment