Unverified Commit c0bdef23 by Serge Hallyn Committed by GitHub

Merge pull request #3412 from brauner/2020-05-15/clone3

clone3: add infrastructure and switch container creation to it
parents 748166a3 04a49a14
......@@ -622,7 +622,10 @@ AC_CHECK_HEADER([ifaddrs.h],
AC_HEADER_MAJOR
# Check for some syscalls functions
AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree])
AC_CHECK_FUNCS([setns pivot_root sethostname unshare rand_r confstr faccessat gettid memfd_create move_mount open_tree execveat clone3])
AC_CHECK_TYPES([struct clone_args], [], [], [[#include <linux/sched.h>]])
AC_CHECK_MEMBERS([struct clone_args.set_tid],[],[],[[#include <linux/sched.h>]])
AC_CHECK_MEMBERS([struct clone_args.cgroup],[],[],[[#include <linux/sched.h>]])
# Check for strerror_r() support. Defines:
# - HAVE_STRERROR_R if available
......@@ -761,7 +764,7 @@ AX_CHECK_COMPILE_FLAG([-Wstringop-overflow], [CFLAGS="$CFLAGS -Wstringop-overflo
AX_CHECK_LINK_FLAG([-z relro], [LDFLAGS="$LDFLAGS -z relro"],,[])
AX_CHECK_LINK_FLAG([-z now], [LDFLAGS="$LDFLAGS -z now"],,[])
CFLAGS="$CFLAGS -Wvla -std=gnu11"
CFLAGS="$CFLAGS -Wvla -std=gnu11 -fms-extensions"
if test "x$enable_werror" = "xyes"; then
CFLAGS="$CFLAGS -Werror"
fi
......
......@@ -29,7 +29,7 @@
#include <fcntl.h>
#include "config.h"
#include "macro.h"
#include "raw_syscalls.h"
#include "process_utils.h"
int fexecve(int fd, char *const argv[], char *const envp[])
{
......
......@@ -27,7 +27,7 @@ noinst_HEADERS = api_extensions.h \
memory_utils.h \
monitor.h \
namespace.h \
raw_syscalls.h \
process_utils.h \
rexec.h \
start.h \
state.h \
......@@ -128,7 +128,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \
network.c network.h \
monitor.c monitor.h \
parse.c parse.h \
raw_syscalls.c raw_syscalls.h \
process_utils.c process_utils.h \
ringbuf.c ringbuf.h \
rtnl.c rtnl.h \
state.c state.h \
......@@ -384,7 +384,7 @@ init_lxc_SOURCES = cmd/lxc_init.c \
initutils.c initutils.h \
memory_utils.h \
parse.c parse.h \
raw_syscalls.c raw_syscalls.h \
process_utils.c process_utils.h \
syscall_numbers.h \
string_utils.c string_utils.h
......@@ -395,7 +395,7 @@ lxc_monitord_SOURCES = cmd/lxc_monitord.c \
log.c log.h \
mainloop.c mainloop.h \
monitor.c monitor.h \
raw_syscalls.c raw_syscalls.h \
process_utils.c process_utils.h \
syscall_numbers.h \
utils.c utils.h
lxc_user_nic_SOURCES = cmd/lxc_user_nic.c \
......@@ -404,7 +404,7 @@ lxc_user_nic_SOURCES = cmd/lxc_user_nic.c \
memory_utils.h \
network.c network.h \
parse.c parse.h \
raw_syscalls.c raw_syscalls.h \
process_utils.c process_utils.h \
syscall_numbers.h \
file_utils.c file_utils.h \
string_utils.c string_utils.h \
......
......@@ -18,7 +18,7 @@
#include "log.h"
#include "macro.h"
#include "memory_utils.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "utils.h"
#ifndef HAVE_STRLCPY
......
......@@ -40,7 +40,7 @@
#include "mainloop.h"
#include "memory_utils.h"
#include "namespace.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "syscall_wrappers.h"
#include "terminal.h"
#include "utils.h"
......
......@@ -1149,7 +1149,7 @@ static int mkdir_eexist_on_last(const char *dir, mode_t mode)
ret = mkdir(makeme, mode);
if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
return log_warn_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
} while (tmp != dir);
return 0;
......@@ -1179,9 +1179,9 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
ret = mkdir_eexist_on_last(limit_path, 0755);
if (ret < 0)
return log_error_errno(false, errno,
"Failed to create %s limiting cgroup",
limit_path);
return log_debug_errno(false,
errno, "Failed to create %s limiting cgroup",
limit_path);
h->cgfd_limit = lxc_open_dirfd(limit_path);
if (h->cgfd_limit < 0)
......@@ -1208,7 +1208,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
* directory for us to ensure correct initialization.
*/
if (ret_cpuset != 1 || cgroup_tree)
return log_error_errno(false, errno, "Failed to create %s cgroup", path);
return log_debug_errno(false, errno, "Failed to create %s cgroup", path);
}
if (payload) {
......@@ -1351,7 +1351,7 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
monitor_cgroup, false, NULL))
continue;
ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
for (int j = 0; j < i; j++)
cgroup_tree_leaf_remove(ops->hierarchies[j], false);
......@@ -1361,7 +1361,7 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
if (idx == 1000 || (!suffix && idx != 0))
return ret_set_errno(false, ERANGE);
return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
ops->monitor_cgroup = move_ptr(monitor_cgroup);
return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
......@@ -1455,7 +1455,7 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
limiting_cgroup))
continue;
ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
for (int j = 0; j < i; j++)
cgroup_tree_leaf_remove(ops->hierarchies[j], true);
......@@ -1465,7 +1465,7 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
if (idx == 1000 || (!suffix && idx != 0))
return ret_set_errno(false, ERANGE);
return log_error_errno(false, ERANGE, "Failed to create container cgroup");
ops->container_cgroup = move_ptr(container_cgroup);
INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
......
......@@ -28,7 +28,7 @@
#include "initutils.h"
#include "memory_utils.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "string_utils.h"
/* option keys for long only options */
......
......@@ -28,7 +28,7 @@
#include "log.h"
#include "mainloop.h"
#include "monitor.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "utils.h"
#define CLIENTFDS_CHUNK 64
......
......@@ -36,7 +36,7 @@
#include "memory_utils.h"
#include "network.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"
......
......@@ -51,7 +51,7 @@
#include "namespace.h"
#include "network.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "ringbuf.h"
#include "start.h"
#include "storage.h"
......@@ -3245,7 +3245,7 @@ static bool verify_start_hooks(struct lxc_conf *conf)
static bool execveat_supported(void)
{
lxc_raw_execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
if (errno == ENOSYS)
return false;
......
......@@ -14,7 +14,7 @@
#include "config.h"
#include "log.h"
#include "start.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "utils.h"
lxc_log_define(execute, start);
......@@ -66,7 +66,7 @@ static int execute_start(struct lxc_handler *handler, void* data)
NOTICE("Exec'ing \"%s\"", my_args->argv[0]);
if (my_args->init_fd >= 0)
lxc_raw_execveat(my_args->init_fd, "", argv, environ, AT_EMPTY_PATH);
execveat(my_args->init_fd, "", argv, environ, AT_EMPTY_PATH);
else
execvp(argv[0], argv);
SYSERROR("Failed to exec %s", argv[0]);
......
......@@ -19,7 +19,7 @@
#include "log.h"
#include "lsm.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "utils.h"
lxc_log_define(apparmor, lsm);
......
......@@ -49,7 +49,7 @@
#include "namespace.h"
#include "network.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "start.h"
#include "state.h"
#include "storage.h"
......
......@@ -21,33 +21,6 @@
lxc_log_define(namespace, lxc);
/*
* Let's use the "standard stack limit" (i.e. glibc thread size default) for
* stack sizes: 8MB.
*/
#define __LXC_STACK_SIZE (8 * 1024 * 1024)
pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd)
{
pid_t ret;
void *stack;
stack = malloc(__LXC_STACK_SIZE);
if (!stack) {
SYSERROR("Failed to allocate clone stack");
return -ENOMEM;
}
#ifdef __ia64__
ret = __clone2(fn, stack, __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd);
#else
ret = clone(fn, stack + __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd);
#endif
if (ret < 0)
SYSERROR("Failed to clone (%#x)", flags);
return ret;
}
/* Leave the user namespace at the first position in the array of structs so
* that we always attach to it first when iterating over the struct and using
* setns() to switch namespaces. This especially affects lxc_attach(): Suppose
......
......@@ -7,63 +7,6 @@
#include <unistd.h>
#include <sys/syscall.h>
#ifndef CLONE_PARENT_SETTID
#define CLONE_PARENT_SETTID 0x00100000
#endif
#ifndef CLONE_CHILD_CLEARTID
#define CLONE_CHILD_CLEARTID 0x00200000
#endif
#ifndef CLONE_CHILD_SETTID
#define CLONE_CHILD_SETTID 0x01000000
#endif
#ifndef CLONE_VFORK
#define CLONE_VFORK 0x00004000
#endif
#ifndef CLONE_THREAD
#define CLONE_THREAD 0x00010000
#endif
#ifndef CLONE_SETTLS
#define CLONE_SETTLS 0x00080000
#endif
#ifndef CLONE_VM
#define CLONE_VM 0x00000100
#endif
#ifndef CLONE_FILES
#define CLONE_FILES 0x00000400
#endif
#ifndef CLONE_FS
# define CLONE_FS 0x00000200
#endif
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC 0x08000000
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000
#endif
#ifndef CLONE_NEWPID
# define CLONE_NEWPID 0x20000000
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET 0x40000000
#endif
enum {
LXC_NS_USER,
LXC_NS_MNT,
......@@ -82,39 +25,6 @@ extern const struct ns_info {
const char *env_name;
} ns_info[LXC_NS_MAX];
#if defined(__ia64__)
int __clone2(int (*__fn) (void *__arg), void *__child_stack_base,
size_t __child_stack_size, int __flags, void *__arg, ...);
#else
int clone(int (*fn)(void *), void *child_stack,
int flags, void *arg, ...
/* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ );
#endif
/**
* lxc_clone() - create a new process
*
* - allocate stack:
* This function allocates a new 8MB stack (__LXC_STACK_SIZE) and passes it to
* the kernel.
*
* - support all CLONE_*flags:
* This function supports all CLONE_* flags. If in doubt or not sufficiently
* familiar with process creation in the kernel and interactions with libcs
* this function should be used.
*
* - pthread_atfork() handlers depending on libc:
* Whether this function runs pthread_atfork() handlers depends on the
* corresponding libc wrapper. glibc currently does not run pthread_atfork()
* handlers but does not guarantee that they are not. Other libcs might or
* might not run pthread_atfork() handlers. If you require guarantees please
* refer to the lxc_raw_clone*() functions in raw_syscalls.{c,h}.
*
* - should call lxc_raw_getpid():
* The child should use lxc_raw_getpid() to retrieve its pid.
*/
extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
extern int lxc_namespace_2_cloneflag(const char *namespace);
extern int lxc_namespace_2_ns_idx(const char *namespace);
extern int lxc_namespace_2_std_identifiers(char *namespaces);
......
......@@ -36,7 +36,7 @@
#include "memory_utils.h"
#include "network.h"
#include "nl.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"
......
......@@ -13,15 +13,12 @@
#include "compiler.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "syscall_numbers.h"
/*
 * Invoke the execveat system call directly through syscall(), forwarding all
 * arguments unchanged, and hand back its result.
 */
int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[],
		     char *const envp[], int flags)
{
	long ret;

	ret = syscall(__NR_execveat, dirfd, pathname, argv, envp, flags);

	return (int)ret;
}
lxc_log_define(process_utils, lxc);
/*
* This is based on raw_clone in systemd but adapted to our needs. This uses
......@@ -31,16 +28,8 @@ int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[],
* The nice thing about this is that we get fork() behavior. That is
* lxc_raw_clone() returns 0 in the child and the child pid in the parent.
*/
__returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd)
{
/*
* These flags don't interest at all so we don't jump through any hoops
* of retrieving them and passing them to the kernel.
*/
errno = EINVAL;
if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
return -EINVAL;
#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
/* On s390/s390x and cris the order of the first and second arguments
......@@ -100,6 +89,31 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
#endif
}
/*
 * Fork-like process creation: try clone3() first and only use the legacy
 * clone path when the kernel reports that clone3() is not implemented.
 *
 * Flags that would require a new stack, TLS or tid bookkeeping are rejected
 * with EINVAL.
 */
__returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
{
	const unsigned long unsupported = CLONE_VM | CLONE_PARENT_SETTID |
					  CLONE_CHILD_SETTID |
					  CLONE_CHILD_CLEARTID | CLONE_SETTLS;
	struct lxc_clone_args args = {
		.flags = flags,
		.pidfd = ptr_to_u64(pidfd),
	};
	pid_t child;

	if ((flags & unsupported) != 0)
		return ret_errno(EINVAL);

	/*
	 * With CLONE_PARENT the exit signal is inherited from the parent, so
	 * only request SIGCHLD explicitly in all other cases.
	 */
	if ((flags & CLONE_PARENT) == 0)
		args.exit_signal = SIGCHLD;

	child = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0);
	if (child >= 0 || errno != ENOSYS)
		return child;

	SYSTRACE("Falling back to legacy clone");
	return __lxc_raw_clone(flags, pidfd);
}
pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags,
int *pidfd)
{
......@@ -124,3 +138,30 @@ int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
{
return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}
/*
* Let's use the "standard stack limit" (i.e. glibc thread size default) for
* stack sizes: 8MB.
*/
#define __LXC_STACK_SIZE (8 * 1024 * 1024)
pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd)
{
pid_t ret;
void *stack;
stack = malloc(__LXC_STACK_SIZE);
if (!stack) {
SYSERROR("Failed to allocate clone stack");
return -ENOMEM;
}
#ifdef __ia64__
ret = __clone2(fn, stack, __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd);
#else
ret = clone(fn, stack + __LXC_STACK_SIZE, flags | SIGCHLD, arg, pidfd);
#endif
if (ret < 0)
SYSERROR("Failed to clone (%#x)", flags);
return ret;
}
/* SPDX-License-Identifier: LGPL-2.1+ */
#ifndef __LXC_PROCESS_UTILS_H
#define __LXC_PROCESS_UTILS_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <linux/sched.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "compiler.h"
#include "config.h"
#include "syscall_numbers.h"
#ifndef CSIGNAL
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#endif
#ifndef CLONE_VM
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#endif
#ifndef CLONE_FS
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#endif
#ifndef CLONE_FILES
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#endif
#ifndef CLONE_SIGHAND
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#endif
#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#endif
#ifndef CLONE_PTRACE
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#endif
#ifndef CLONE_VFORK
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#endif
#ifndef CLONE_PARENT
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#endif
#ifndef CLONE_THREAD
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#endif
#ifndef CLONE_NEWNS
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#endif
#ifndef CLONE_SYSVSEM
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#endif
#ifndef CLONE_SETTLS
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#endif
#ifndef CLONE_PARENT_SETTID
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#endif
#ifndef CLONE_CHILD_CLEARTID
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#endif
#ifndef CLONE_DETACHED
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#endif
#ifndef CLONE_UNTRACED
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#endif
#ifndef CLONE_CHILD_SETTID
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#endif
#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#endif
#ifndef CLONE_IO
#define CLONE_IO 0x80000000 /* Clone io context */
#endif
/* Flags for the clone3() syscall. */
#ifndef CLONE_CLEAR_SIGHAND
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#endif
#ifndef CLONE_INTO_CGROUP
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#endif
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
* syscalls only:
*/
#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
#endif
/* waitid */
#ifndef P_PIDFD
#define P_PIDFD 3
#endif
#ifndef CLONE_ARGS_SIZE_VER0
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#endif
#ifndef CLONE_ARGS_SIZE_VER1
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#endif
#ifndef CLONE_ARGS_SIZE_VER2
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
#endif
#ifndef ptr_to_u64
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
#endif
/*
 * Convert a 64-bit integer back into a pointer. The argument is parenthesized
 * so that compound expressions (e.g. "a | b") are converted as a whole.
 */
#ifndef u64_to_ptr
#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
#endif
/*
 * Argument block handed to the clone3() system call via lxc_clone3().
 *
 * The CLONE_ARGS_SIZE_VER* constants defined above describe prefixes of this
 * layout, assuming every member occupies 8 bytes: VER0 (64 bytes) ends after
 * tls, VER1 (80 bytes) additionally covers set_tid and set_tid_size, and VER2
 * (88 bytes) additionally covers cgroup.
 *
 * NOTE(review): the members presumably mirror the kernel's struct clone_args
 * from <linux/sched.h>; confirm field semantics against clone3(2).
 */
struct lxc_clone_args {
/* CLONE_* flags for the new process. */
__aligned_u64 flags;
/* Address of the caller's pidfd slot, encoded with ptr_to_u64(). */
__aligned_u64 pidfd;
__aligned_u64 child_tid;
__aligned_u64 parent_tid;
/* Exit signal for the child; lxc_raw_clone() sets SIGCHLD unless CLONE_PARENT is used. */
__aligned_u64 exit_signal;
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
};
/*
 * lxc_clone3() - issue the clone3 system call directly via syscall()
 *
 * @args: argument block passed to the kernel
 * @size: number of bytes of @args passed along to the kernel (for example
 *        CLONE_ARGS_SIZE_VER0)
 *
 * Returns the raw result of syscall(). Marked __returns_twice, presumably
 * because the call can return in both the parent and the new child.
 */
__returns_twice static inline pid_t lxc_clone3(struct lxc_clone_args *args, size_t size)
{
return syscall(__NR_clone3, args, size);
}
#if defined(__ia64__)
int __clone2(int (*__fn)(void *__arg), void *__child_stack_base,
size_t __child_stack_size, int __flags, void *__arg, ...);
#else
int clone(int (*fn)(void *), void *child_stack, int flags, void *arg, ...
/* pid_t *ptid, struct user_desc *tls, pid_t *ctid */);
#endif
/**
* lxc_clone() - create a new process
*
* - allocate stack:
* This function allocates a new 8MB stack (__LXC_STACK_SIZE) and passes it to
* the kernel.
*
* - support all CLONE_*flags:
* This function supports all CLONE_* flags. If in doubt or not sufficiently
* familiar with process creation in the kernel and interactions with libcs
* this function should be used.
*
* - pthread_atfork() handlers depending on libc:
* Whether this function runs pthread_atfork() handlers depends on the
* corresponding libc wrapper. glibc currently does not run pthread_atfork()
* handlers but does not guarantee that they are not. Other libcs might or
* might not run pthread_atfork() handlers. If you require guarantees please
* refer to the lxc_raw_clone*() functions in process_utils.{c,h}.
*
* - should call lxc_raw_getpid():
* The child should use lxc_raw_getpid() to retrieve its pid.
*/
extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
/*
* lxc_raw_clone() - create a new process
*
* - fork() behavior:
* This function returns 0 in the child and > 0 in the parent.
*
* - copy-on-write:
* This function does not allocate a new stack and relies on copy-on-write
* semantics.
*
* - supports subset of CLONE_* flags:
* lxc_raw_clone() intentionally only supports a subset of the flags available
* to the actual system call. Please refer to the implementation what flags
* cannot be used. Also, please don't assume that just because a flag isn't
* explicitly checked for as being unsupported that it is supported. If in
* doubt or not sufficiently familiar with process creation in the kernel and
* interactions with libcs this function should be used.
*
* - no pthread_atfork() handlers:
* This function circumvents - as much as this is possible - any libc
* wrappers and thus does not run any pthread_atfork() handlers. Make sure
* that this is safe to do in the context you are trying to call this
* function.
*
* - must call lxc_raw_getpid():
* The child must use lxc_raw_getpid() to retrieve its pid.
*/
extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
/*
* lxc_raw_clone_cb() - create a new process
*
* - non-fork() behavior:
* Function does return pid of the child or -1 on error. Pass in a callback
* function via the "fn" argument that gets executed in the child process.
* The "args" argument is passed to "fn".
*
* All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb()
* as well.
*/
extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args,
unsigned long flags, int *pidfd);
/*
 * execveat(): rely on the libc declaration when HAVE_EXECVEAT is defined,
 * otherwise provide a fallback that invokes the system call directly.
 */
#ifdef HAVE_EXECVEAT
extern int execveat(int dirfd, const char *pathname, char *const argv[],
		    char *const envp[], int flags);
#else
static inline int execveat(int dirfd, const char *pathname, char *const argv[],
			   char *const envp[], int flags)
{
	long ret;

	ret = syscall(__NR_execveat, dirfd, pathname, argv, envp, flags);

	return (int)ret;
}
#endif
/*
* Because of older glibc's pid cache (up to 2.25) whenever clone() is called
* the child must retrieve its own pid via lxc_raw_getpid().
*/
static inline pid_t lxc_raw_getpid(void)
{
	/* Ask the kernel directly instead of going through libc's getpid(). */
	const long self = syscall(SYS_getpid);

	return (pid_t)self;
}
/*
 * Return the caller's thread id via the gettid system call; when no usable
 * syscall number is known, return the process id instead.
 */
static inline pid_t lxc_raw_gettid(void)
{
	pid_t tid;

#if !(__NR_gettid > 0)
	tid = lxc_raw_getpid();
#else
	tid = (pid_t)syscall(__NR_gettid);
#endif

	return tid;
}
extern int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
unsigned int flags);
#endif /* __LXC_PROCESS_UTILS_H */
/* SPDX-License-Identifier: LGPL-2.1+ */
#ifndef __LXC_RAW_SYSCALL_H
#define __LXC_RAW_SYSCALL_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
/* clone */
#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000
#endif
/* waitid */
#ifndef P_PIDFD
#define P_PIDFD 3
#endif
/*
* lxc_raw_clone() - create a new process
*
* - fork() behavior:
* This function returns 0 in the child and > 0 in the parent.
*
* - copy-on-write:
* This function does not allocate a new stack and relies on copy-on-write
* semantics.
*
* - supports subset of CLONE_* flags:
* lxc_raw_clone() intentionally only supports a subset of the flags available
* to the actual system call. Please refer to the implementation what flags
* cannot be used. Also, please don't assume that just because a flag isn't
* explicitly checked for as being unsupported that it is supported. If in
* doubt or not sufficiently familiar with process creation in the kernel and
* interactions with libcs this function should be used.
*
* - no pthread_atfork() handlers:
* This function circumvents - as much as this is possible - any libc
* wrappers and thus does not run any pthread_atfork() handlers. Make sure
* that this is safe to do in the context you are trying to call this
* function.
*
* - must call lxc_raw_getpid():
* The child must use lxc_raw_getpid() to retrieve its pid.
*/
extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
/*
* lxc_raw_clone_cb() - create a new process
*
* - non-fork() behavior:
* Function does return pid of the child or -1 on error. Pass in a callback
* function via the "fn" argument that gets executed in the child process.
* The "args" argument is passed to "fn".
*
* All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb()
* as well.
*/
extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args,
unsigned long flags, int *pidfd);
extern int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[],
char *const envp[], int flags);
/*
* Because of older glibc's pid cache (up to 2.25) whenever clone() is called
* the child must retrieve its own pid via lxc_raw_getpid().
*/
static inline pid_t lxc_raw_getpid(void)
{
	/* Ask the kernel directly instead of going through libc's getpid(). */
	const long self = syscall(SYS_getpid);

	return (pid_t)self;
}
/*
 * Return the caller's thread id via the gettid system call; when no usable
 * syscall number is known, return the process id instead.
 */
static inline pid_t lxc_raw_gettid(void)
{
	pid_t tid;

#if !(__NR_gettid > 0)
	tid = lxc_raw_getpid();
#else
	tid = (pid_t)syscall(__NR_gettid);
#endif

	return tid;
}
extern int lxc_raw_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
unsigned int flags);
#endif /* __LXC_RAW_SYSCALL_H */
......@@ -13,7 +13,7 @@
#include "file_utils.h"
#include "macro.h"
#include "memory_utils.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
......
......@@ -47,7 +47,7 @@
#include "monitor.h"
#include "namespace.h"
#include "network.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "start.h"
#include "storage/storage.h"
#include "storage/storage_utils.h"
......
......@@ -40,7 +40,7 @@
#elif defined __sparc__
#define __NR_keyctl 283
#elif defined __ia64__
#define __NR_keyctl 249
#define __NR_keyctl (249 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_keyctl 4282
......@@ -112,7 +112,7 @@
#elif defined __sparc__
#define __NR_pivot_root 146
#elif defined __ia64__
#define __NR_pivot_root 183
#define __NR_pivot_root (183 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_pivot_root 4216
......@@ -147,7 +147,7 @@
#elif defined __sparc__
#define __NR_setns 337
#elif defined __ia64__
#define __NR_setns 306
#define __NR_setns (306 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_setns 4344
......@@ -182,7 +182,7 @@
#elif defined __sparc__
#define __NR_sethostname 88
#elif defined __ia64__
#define __NR_sethostname 59
#define __NR_sethostname (59 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_sethostname 4074 /* MIPS o32 syscall numbers start at 4000; sethostname is entry 74 */
......@@ -217,7 +217,7 @@
#elif defined __sparc__
#define __NR_signalfd 311
#elif defined __ia64__
#define __NR_signalfd 283
#define __NR_signalfd (283 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_signalfd 4317
......@@ -252,7 +252,7 @@
#elif defined __sparc__
#define __NR_signalfd4 317
#elif defined __ia64__
#define __NR_signalfd4 289
#define __NR_signalfd4 (289 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_signalfd4 4324
......@@ -287,7 +287,7 @@
#elif defined __sparc__
#define __NR_unshare 299
#elif defined __ia64__
#define __NR_unshare 272
#define __NR_unshare (272 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_unshare 4303
......@@ -322,7 +322,7 @@
#elif defined __sparc__
#define __NR_bpf 349
#elif defined __ia64__
#define __NR_bpf 317
#define __NR_bpf (317 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_bpf 4355
......@@ -357,7 +357,7 @@
#elif defined __sparc__
#define __NR_faccessat 296
#elif defined __ia64__
#define __NR_faccessat 269
#define __NR_faccessat (269 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_faccessat 4300
......@@ -387,6 +387,8 @@
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_pidfd_send_signal 5424
#endif
#elif defined __ia64__
#define __NR_pidfd_send_signal (424 + 1024)
#else
#define __NR_pidfd_send_signal 424
#endif
......@@ -410,7 +412,7 @@
#elif defined __sparc__
#define __NR_seccomp 346
#elif defined __ia64__
#define __NR_seccomp 329
#define __NR_seccomp (329 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_seccomp 4352
......@@ -445,7 +447,7 @@
#elif defined __sparc__
#define __NR_gettid 143
#elif defined __ia64__
#define __NR_gettid 81
#define __NR_gettid (81 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_gettid 4222
......@@ -484,7 +486,7 @@
#elif defined __sparc__
#define __NR_execveat 350
#elif defined __ia64__
#define __NR_execveat 318
#define __NR_execveat (318 + 1024)
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_execveat 4356
......@@ -514,6 +516,8 @@
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
#define __NR_move_mount (429 + 1024) /* 428 is open_tree; move_mount is 429 */
#else
#define __NR_move_mount 429
#endif
......@@ -532,9 +536,31 @@
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_open_tree 5428
#endif
#elif defined __ia64__
#define __NR_open_tree (428 + 1024)
#else
#define __NR_open_tree 428
#endif
#endif
#ifndef __NR_clone3
#if defined __alpha__
#define __NR_clone3 545
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_clone3 4435
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_clone3 6435
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_clone3 5435
#endif
#elif defined __ia64__
#define __NR_clone3 (435 + 1024)
#else
#define __NR_clone3 435
#endif
#endif
#endif /* __LXC_SYSCALL_NUMBERS_H */
......@@ -35,7 +35,7 @@
#include "memory_utils.h"
#include "namespace.h"
#include "parse.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"
......
......@@ -25,7 +25,7 @@
#include "initutils.h"
#include "macro.h"
#include "memory_utils.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "string_utils.h"
/* returns 1 on success, 0 if there were any failures */
......
......@@ -30,7 +30,7 @@ lxc_test_parse_config_file_SOURCES = parse_config_file.c \
lxc_test_raw_clone_SOURCES = lxc_raw_clone.c \
lxctest.h \
../lxc/namespace.c ../lxc/namespace.h \
../lxc/raw_syscalls.c ../lxc/raw_syscalls.h
../lxc/process_utils.c ../lxc/process_utils.h
../lxc/utils.c ../lxc/utils.h
lxc_test_reboot_SOURCES = reboot.c
lxc_test_saveconfig_SOURCES = saveconfig.c
......
......@@ -39,7 +39,7 @@
#include "lxctest.h"
#include "namespace.h"
#include "raw_syscalls.h"
#include "process_utils.h"
#include "utils.h"
int main(int argc, char *argv[])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment