diff --git a/pkg/unshare/unshare.c b/pkg/unshare/unshare.c index 8969191fa..dc7b9d570 100644 --- a/pkg/unshare/unshare.c +++ b/pkg/unshare/unshare.c @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include /* Open Source projects like conda-forge, want to package podman and are based off of centos:6, Conda-force has minimal libc requirements and is lacking @@ -151,16 +154,74 @@ static char **parse_proc_stringlist(const char *list) { return ret; } -static int containers_reexec(void) { - char **argv, *exename; +/* + * Taken from the runc cloned_binary.c file + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + * + * This work is dual licensed under the following licenses. You may use, + * redistribute, and/or modify the work under the conditions of either (or + * both) licenses. + * + * === Apache-2.0 === + */ +static int try_bindfd(void) +{ + int fd, ret = -1; + char src[PATH_MAX] = {0}; + char template[64] = {0}; + + strncpy(template, "/tmp/containers.XXXXXX", sizeof(template) - 1); + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + ret = -EPERM; + + if (readlink("/proc/self/exe", src, sizeof (src) - 1) < 0) + goto out; + + if (mount(src, template, NULL, MS_BIND, NULL) < 0) + goto out; + if (mount(NULL, template, NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) + goto out_umount; + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). 
+ */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in /tmp. + */ + unlink(template); + return ret; +} + +static int copy_self_proc_exe(char **argv) { + char *exename; int fd, mmfd, n_read, n_written; struct stat st; char buf[2048]; - argv = parse_proc_stringlist("/proc/self/cmdline"); - if (argv == NULL) { - return -1; - } fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); if (fd == -1) { fprintf(stderr, "open(\"/proc/self/exe\"): %m\n"); @@ -168,13 +229,14 @@ static int containers_reexec(void) { } if (fstat(fd, &st) == -1) { fprintf(stderr, "fstat(\"/proc/self/exe\"): %m\n"); + close(fd); return -1; } exename = basename(argv[0]); mmfd = syscall(SYS_memfd_create, exename, (long) MFD_ALLOW_SEALING | MFD_CLOEXEC); if (mmfd == -1) { fprintf(stderr, "memfd_create(): %m\n"); - return -1; + goto close_fd; } for (;;) { n_read = read(fd, buf, sizeof(buf)); @@ -188,21 +250,45 @@ static int containers_reexec(void) { n_written = write(mmfd, buf, n_read); if (n_written < 0) { fprintf(stderr, "write(anonfd): %m\n"); - return -1; + goto close_fd; } if (n_written != n_read) { fprintf(stderr, "write(anonfd): short write (%d != %d)\n", n_written, n_read); - return -1; + goto close_fd; } } close(fd); if (fcntl(mmfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL) == -1) { - close(mmfd); - fprintf(stderr, "Error sealing memfd copy: %m\n"); + fprintf(stderr, "Error sealing memfd copy: %m\n"); + goto close_mmfd; + } + + return mmfd; + +close_fd: + close(fd); +close_mmfd: + close(mmfd); + return -1; +} +static int containers_reexec(int flags) { + char **argv; + int fd = -1; + + argv = parse_proc_stringlist("/proc/self/cmdline"); + if (argv == NULL) { return -1; } - if (fexecve(mmfd, argv, environ) == -1) { - close(mmfd); + + if (flags & CLONE_NEWNS) + fd = try_bindfd(); + if (fd < 
0) + fd = copy_self_proc_exe(argv); + if (fd < 0) + return fd; + + if (fexecve(fd, argv, environ) == -1) { + close(fd); fprintf(stderr, "Error during reexec(...): %m\n"); return -1; } @@ -282,7 +368,7 @@ void _containers_unshare(void) _exit(1); } } - if (containers_reexec() != 0) { + if (containers_reexec(flags) != 0) { _exit(1); } return;