diff --git a/docs/1-installing.md b/docs/1-installing.md index b1c2822..44cb689 100644 --- a/docs/1-installing.md +++ b/docs/1-installing.md @@ -35,7 +35,7 @@ To also set up crun-vm for use with Docker: 1. Install crun-vm's runtime dependencies: ```console - $ dnf install bash coreutils crun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core shadow-utils util-linux virtiofsd + $ dnf install bash coreutils crun crun-krun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core sed shadow-utils util-linux virtiofsd ``` 2. Install Rust and Cargo if you do not already have Rust tooling available: diff --git a/docs/2-podman-docker.md b/docs/2-podman-docker.md index 47be99c..7719b9f 100644 --- a/docs/2-podman-docker.md +++ b/docs/2-podman-docker.md @@ -96,6 +96,21 @@ in a container image. Note that flag `--persistent` has no effect when running VMs from container images. +### From bootable container images + +crun-vm can also work with [bootable container images], which are containers +that package a full operating system: + +```console +$ podman run \ + --runtime crun-vm \ + -it --rm \ + quay.io/fedora/fedora-bootc:40 +``` + +Internally, crun-vm generates a VM image from the bootable container and then +boots it. + ## First-boot customization ### cloud-init @@ -320,6 +335,9 @@ To use system emulation instead of hardware-assisted virtualization, specify the `--emulated` flag. Without this flag, attempting to create a VM on a host tbat doesn't support KVM will fail. +It's not currently possible to use this flag when the container image is a bootc +bootable container. + ### Inspecting and customizing the libvirt domain XML crun-vm internally uses [libvirt] to launch a VM, generating a [domain XML @@ -340,6 +358,7 @@ be merged with it using the non-standard option `--merge-libvirt-xml `. 
> Before using this flag, consider if you would be better served using libvirt > directly to manage your VM. +[bootable container images]: https://containers.github.io/bootable/ [cloud-init]: https://cloud-init.io/ [domain XML definition]: https://libvirt.org/formatdomain.html [Ignition]: https://coreos.github.io/ignition/ diff --git a/embed/bootc/config.json b/embed/bootc/config.json new file mode 100644 index 0000000..a40fc6c --- /dev/null +++ b/embed/bootc/config.json @@ -0,0 +1,88 @@ +{ + "ociVersion": "1.0.0", + "process": { + "terminal": true, + "user": { "uid": 0, "gid": 0 }, + "args": ["/output/entrypoint.sh", ""], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [], + "effective": [], + "inheritable": [], + "permitted": [], + "ambient": [] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 262144, + "soft": 262144 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "", + "readonly": false + }, + "hostname": "bootc-install", + "mounts": [ + { + "type": "bind", + "source": "/root/crun-vm/bootc", + "destination": "/output", + "options": ["bind", "rprivate", "rw"] + }, + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + } + ], + "linux": { + "namespaces": [ + { "type": "pid" }, + { "type": "network" }, + { "type": "ipc" }, + { "type": "uts" }, + { "type": "cgroup" }, + { "type": "mount" } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} diff --git 
a/embed/bootc/entrypoint.sh b/embed/bootc/entrypoint.sh new file mode 100644 index 0000000..d2bc3ea --- /dev/null +++ b/embed/bootc/entrypoint.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later + +set -e + +image_name=$1 + +# monkey-patch loopdev partition detection, given we're not running systemd +# (bootc runs `udevadm settle` as a way to wait until loopdev partitions are +# detected; we hijack that call and use partx to set up the partition devices) + +original_udevadm=$( which udevadm ) + +mkdir -p /output/bin + +cat >/output/bin/udevadm </usr/lib/bootc/install/00-crun-vm.toml < "$bootc_dir/progress" 2>&1 + +# this blocks here until the named pipe above is opened by entrypoint.sh + +# get info about the container *image* + +__step 'Storing the container image as an OCI archive...' + +image_info=$( + podman container inspect \ + --format '{{.ImageName}}\t{{.Image}}' \ + "$container_id" + ) + +image_name=$( cut -f1 <<< "$image_info" ) +image_id=$( cut -f2 <<< "$image_info" ) + +oci_archive=$bootc_dir/image.oci-archive + +# save container *image* as an OCI archive + +podman save --format oci-archive --output "$oci_archive.tmp" "$image_id" +mv "$oci_archive.tmp" "$oci_archive" + +# adjust krun config + +__step 'Generating a VM image from the container image...' 
+ +__sed() { + sed -i "s|$1|$2|" "$bootc_dir/config.json" +} + +__sed "" "$image_name" +__sed "" "$original_root" +__sed "" "$priv_dir" + +# run bootc-install under krun + +truncate --size 10G "$bootc_dir/image.raw" # TODO: allow adjusting disk size + +krun run \ + --config "$bootc_dir/config.json" \ + "crun-vm-$container_id" \ + ]) -> Resu let config_path = bundle_path.join("config.json"); let mut spec = oci_spec::runtime::Spec::load(&config_path)?; + ensure_unprivileged(&spec)?; + let original_root_path: Utf8PathBuf = spec.root_path()?.canonicalize()?.try_into()?; // ensure absolute - if let Some(process) = spec.process().as_ref() { - if let Some(capabilities) = process.capabilities().as_ref() { - fn any_is_cap_sys_admin(caps: &Option) -> bool { - caps.as_ref() - .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) - } - - ensure!( - !any_is_cap_sys_admin(capabilities.bounding()) - && !any_is_cap_sys_admin(capabilities.effective()) - && !any_is_cap_sys_admin(capabilities.inheritable()) - && !any_is_cap_sys_admin(capabilities.permitted()) - && !any_is_cap_sys_admin(capabilities.ambient()), - "crun-vm is incompatible with privileged containers" - ); - } - } - let runtime_env = RuntimeEnv::current(&spec, &original_root_path)?; let custom_options = CustomOptions::from_spec(&spec, runtime_env)?; + let is_bootc_container = is_bootc_container( + &args.container_id, + bundle_path, + &original_root_path, + runtime_env, + )?; + + ensure!( + !is_bootc_container || !custom_options.emulated, + "--emulated is incompatible with bootable containers" + ); + // We include container_id in our paths to ensure no overlap with the user container's contents. 
let priv_dir_path = original_root_path.join(format!("crun-vm-{}", args.container_id)); fs::create_dir_all(&priv_dir_path)?; @@ -66,7 +64,13 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu set_file_context(&priv_dir_path, context)?; } - set_up_container_root(&mut spec, &priv_dir_path, &custom_options)?; + set_up_container_root( + &mut spec, + &priv_dir_path, + &custom_options, + is_bootc_container, + )?; + let is_first_create = is_first_create(&spec)?; let base_vm_image_info = set_up_vm_image( @@ -75,6 +79,7 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu &priv_dir_path, &custom_options, is_first_create, + is_bootc_container, )?; let mut mounts = Mounts::default(); @@ -105,9 +110,84 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu crun(raw_args)?; // actually create container + if is_first_create && is_bootc_container { + // We want to ask podman what our image name is, so we can give it to bootc-install, but we + // can't wait synchronously for a response since podman hangs until this create command + // completes. We then want to run bootc-install under krun, which already isolates the + // workload and so can be run outside of our container. We thus launch a process that + // asynchronously performs these steps, and share its progress and output with our + // container's entrypoint through a named pipe. + // + // Note that this process blocks until our container's entrypoint actually starts running, + // thus after the "start" OCI runtime command is called. 
+ + let bootc_dir = priv_dir_path.join("root/crun-vm/bootc"); + fs::create_dir_all(&bootc_dir)?; + + std::process::Command::new(bootc_dir.join("prepare.sh")) + .arg(&original_root_path) + .arg(&priv_dir_path) + .arg(&args.container_id) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + } + Ok(()) } +fn ensure_unprivileged(spec: &oci_spec::runtime::Spec) -> Result<()> { + if let Some(process) = spec.process().as_ref() { + if let Some(capabilities) = process.capabilities().as_ref() { + fn any_is_cap_sys_admin(caps: &Option) -> bool { + caps.as_ref() + .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) + } + + ensure!( + !any_is_cap_sys_admin(capabilities.bounding()) + && !any_is_cap_sys_admin(capabilities.effective()) + && !any_is_cap_sys_admin(capabilities.inheritable()) + && !any_is_cap_sys_admin(capabilities.permitted()) + && !any_is_cap_sys_admin(capabilities.ambient()), + "crun-vm is incompatible with privileged containers" + ); + } + } + + Ok(()) +} + +fn is_bootc_container( + container_id: &str, + bundle_path: &Utf8Path, + original_root_path: &Utf8Path, + env: RuntimeEnv, +) -> Result { + lazy_static! 
{ + static ref PATTERN: Regex = Regex::new(r"/overlay-containers/([^/]+)/userdata$").unwrap(); + } + + let is_bootc_container = original_root_path.join("usr/lib/bootc/install").is_dir(); + + if is_bootc_container { + // check as much as we can that we're running under podman + + let is_podman_bundle_path = match PATTERN.captures(bundle_path.as_str()) { + Some(captures) => &captures[1] == container_id, + None => false, + }; + + ensure!( + env == RuntimeEnv::Other && is_podman_bundle_path, + "bootc containers are only supported with Podman" + ); + } + + Ok(is_bootc_container) +} + fn is_first_create(spec: &oci_spec::runtime::Spec) -> Result { let path = spec.root_path()?.join("crun-vm/create-ran"); @@ -129,6 +209,7 @@ fn set_up_container_root( spec: &mut oci_spec::runtime::Spec, priv_dir_path: &Utf8Path, custom_options: &CustomOptions, + is_bootc_container: bool, ) -> Result<()> { let new_root_path = priv_dir_path.join("root"); fs::create_dir_all(&new_root_path)?; @@ -148,19 +229,22 @@ fn set_up_container_root( .unwrap(), )); - // set up container scripts + // set up container files #[derive(RustEmbed)] - #[folder = "scripts/"] - struct Scripts; + #[folder = "embed/"] + struct Embed; - for path in Scripts::iter() { + for path in Embed::iter() { let path_in_host = new_root_path.join("crun-vm").join(path.as_ref()); fs::create_dir_all(path_in_host.parent().unwrap())?; - let file = Scripts::get(&path).unwrap(); + let file = Embed::get(&path).unwrap(); fs::write(&path_in_host, file.data)?; - fs::set_permissions(&path_in_host, Permissions::from_mode(0o755))?; + + let is_script = path.as_ref().ends_with(".sh"); + let mode = if is_script { 0o755 } else { 0o644 }; + fs::set_permissions(&path_in_host, Permissions::from_mode(mode))?; } // configure container entrypoint @@ -170,7 +254,8 @@ fn set_up_container_root( } else if custom_options.print_config_json { vec!["cat", "/crun-vm/config.json"] } else { - vec!["/crun-vm/entrypoint.sh"] + let arg = if is_bootc_container { "1" } 
else { "0" }; + vec!["/crun-vm/entrypoint.sh", arg] }; spec.set_process({ @@ -194,7 +279,20 @@ fn set_up_vm_image( priv_dir_path: &Utf8Path, custom_options: &CustomOptions, is_first_create: bool, + is_bootc_container: bool, ) -> Result { + let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); + let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); + + if is_bootc_container { + // the image will be generated later + return Ok(VmImageInfo { + path: mirror_vm_image_path_in_container, + size: 0, + format: "raw".to_string(), + }); + } + // where inside the container to look for the VM image const VM_IMAGE_SEARCH_PATHS: [&str; 2] = ["./", "disk/"]; @@ -218,9 +316,6 @@ fn set_up_vm_image( fs::hard_link(vm_image_path_in_host, image_dir_path.join("image"))?; } - let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); - let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); - if custom_options.persistent { // Mount overlayfs to expose the user's VM image file with a different SELinux context so we // can always access it, using the file's parent as the upperdir so that writes still @@ -230,7 +325,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), false, )?; @@ -253,7 +348,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), true, )?; @@ -580,7 +675,7 @@ fn set_up_security(spec: &mut oci_spec::runtime::Spec) { // TODO: This doesn't seem reasonable at all. Should we just force users to use a different // seccomp profile? 
Should passt provide the option to bypass a lot of the isolation that it // does, given we're already in a container *and* under a seccomp profile? - spec.linux_seccomp_syscalls_push( + spec.linux_seccomp_syscalls_push_front( oci_spec::runtime::LinuxSyscallBuilder::default() .names(["mount", "pivot_root", "umount2", "unshare"].map(String::from)) .action(oci_spec::runtime::LinuxSeccompAction::ScmpActAllow) diff --git a/src/util.rs b/src/util.rs index fc0cde4..84ce83f 100644 --- a/src/util.rs +++ b/src/util.rs @@ -33,13 +33,13 @@ pub fn fix_selinux_label(process: &mut oci_spec::runtime::Process) { pub fn set_file_context(path: impl AsRef, context: &str) -> Result<()> { extern "C" { - fn setfilecon(path: *const c_char, con: *const c_char) -> i32; + fn lsetfilecon(path: *const c_char, con: *const c_char) -> i32; } let path = CString::new(path.as_ref().as_os_str().as_bytes())?; let context = CString::new(context.as_bytes())?; - if unsafe { setfilecon(path.as_ptr(), context.as_ptr()) } != 0 { + if unsafe { lsetfilecon(path.as_ptr(), context.as_ptr()) } != 0 { return Err(io::Error::last_os_error().into()); } @@ -198,7 +198,7 @@ pub trait SpecExt { linux_device_cgroup: oci_spec::runtime::LinuxDeviceCgroup, ); fn process_capabilities_insert_beip(&mut self, capability: oci_spec::runtime::Capability); - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); + fn linux_seccomp_syscalls_push_front(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); } impl SpecExt for oci_spec::runtime::Spec { @@ -276,7 +276,10 @@ impl SpecExt for oci_spec::runtime::Spec { }); } - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall) { + fn linux_seccomp_syscalls_push_front( + &mut self, + linux_syscall: oci_spec::runtime::LinuxSyscall, + ) { self.set_linux({ let mut linux = self.linux().clone().expect("linux config"); linux.set_seccomp({ @@ -284,7 +287,7 @@ impl SpecExt for oci_spec::runtime::Spec { if let 
Some(seccomp) = &mut seccomp { seccomp.set_syscalls({ let mut syscalls = seccomp.syscalls().clone().unwrap_or_default(); - syscalls.push(linux_syscall); + syscalls.insert(0, linux_syscall); Some(syscalls) }); } diff --git a/tests/env.sh b/tests/env.sh index f6ed1d2..31e4397 100755 --- a/tests/env.sh +++ b/tests/env.sh @@ -13,18 +13,21 @@ declare -A TEST_IMAGES TEST_IMAGES=( [fedora]=quay.io/containerdisks/fedora:40 # uses cloud-init [coreos]=quay.io/crun-vm/example-fedora-coreos:40 # uses Ignition + [fedora-bootc]=quay.io/fedora/fedora-bootc:40 # bootable container ) declare -A TEST_IMAGES_DEFAULT_USER TEST_IMAGES_DEFAULT_USER=( [fedora]=fedora [coreos]=core + [fedora-bootc]=cloud-user ) declare -A TEST_IMAGES_DEFAULT_USER_HOME TEST_IMAGES_DEFAULT_USER_HOME=( [fedora]=/home/fedora [coreos]=/var/home/core + [fedora-bootc]=/var/home/cloud-user ) __bad_usage() { @@ -140,12 +143,12 @@ build) # expand base image - __log_and_run qemu-img create -f qcow2 "$temp_dir/resized-image.qcow2" 20G + __log_and_run qemu-img create -f qcow2 "$temp_dir/image.qcow2" 50G __log_and_run virt-resize \ --quiet \ --expand /dev/sda4 \ "$temp_dir/image" \ - "$temp_dir/resized-image.qcow2" + "$temp_dir/image.qcow2" rm "$temp_dir/image" @@ -179,6 +182,7 @@ build) bash \ coreutils \ crun \ + crun-krun \ docker \ genisoimage \ grep \ @@ -210,17 +214,12 @@ build) __log_and_run podman wait --ignore "$container_name-build" __extra_cleanup() { :; } - __log_and_run virt-sparsify \ - --quiet \ - "$temp_dir/resized-image.qcow2" \ - "$temp_dir/final-image.qcow2" - - rm "$temp_dir/resized-image.qcow2" + __log_and_run virt-sparsify --quiet --in-place "$temp_dir/image.qcow2" # package new image file __log_and_run "$( __rel "$repo_root/util/package-vm-image.sh" )" \ - "$temp_dir/final-image.qcow2" \ + "$temp_dir/image.qcow2" \ "$env_image" __big_log 33 'Done.' 
diff --git a/tests/t/bootc-rootfs.sh b/tests/t/bootc-rootfs.sh new file mode 100644 index 0000000..5d78b04 --- /dev/null +++ b/tests/t/bootc-rootfs.sh @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ "$ENGINE" == docker ]]; then + # we only support bootc containers under Podman + __skip +fi + +"$UTIL_DIR/extract-vm-image.sh" "${TEST_IMAGES[fedora-bootc]}" "$TEMP_DIR/image" + +__run() { + __engine run --rm --detach --name bootc-rootfs "$@" --rootfs "$TEMP_DIR" +} + +! __run +! __run --persistent diff --git a/tests/t/cloud-init.sh b/tests/t/cloud-init.sh index 6ea51dd..5ac77e1 100644 --- a/tests/t/cloud-init.sh +++ b/tests/t/cloud-init.sh @@ -1,30 +1,36 @@ # SPDX-License-Identifier: GPL-2.0-or-later -image="${TEST_IMAGES[fedora]}" -user="${TEST_IMAGES_DEFAULT_USER[fedora]}" -home="${TEST_IMAGES_DEFAULT_USER_HOME[fedora]}" +for os in fedora fedora-bootc; do -cat >"$TEMP_DIR/user-data" <"$TEMP_DIR/user-data" <<-EOF + #cloud-config + write_files: + - path: $home/file + content: | + hello EOF -cat >"$TEMP_DIR/meta-data" <"$TEMP_DIR/meta-data" <<-EOF EOF -__engine run \ - --rm --detach \ - --name cloud-init \ - "$image" \ - --cloud-init "$TEMP_DIR" + __engine run \ + --rm --detach \ + --name cloud-init \ + "$image" \ + --cloud-init "$TEMP_DIR" -__test() { - __engine exec cloud-init --as "$user" "cmp $home/file <<< hello" -} + __test() { + __engine exec cloud-init --as "$user" "cmp $home/file <<< hello" + } -__test -__engine restart cloud-init -__test + __test + __engine restart cloud-init + __test + + __engine stop cloud-init + +done diff --git a/tests/t/hostname.sh b/tests/t/hostname.sh index 3b1039c..c185f76 100644 --- a/tests/t/hostname.sh +++ b/tests/t/hostname.sh @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -for os in fedora coreos; do +for os in "${!TEST_IMAGES[@]}"; do image="${TEST_IMAGES[$os]}" user="${TEST_IMAGES_DEFAULT_USER[$os]}" diff --git a/tests/t/mount.sh b/tests/t/mount.sh index cebb2e5..4be1085 100644 --- 
a/tests/t/mount.sh +++ b/tests/t/mount.sh @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -for os in fedora coreos; do +for os in "${!TEST_IMAGES[@]}"; do image="${TEST_IMAGES[$os]}" user="${TEST_IMAGES_DEFAULT_USER[$os]}" diff --git a/tests/t/publish.sh b/tests/t/publish.sh index 4c9b98a..45ef70a 100644 --- a/tests/t/publish.sh +++ b/tests/t/publish.sh @@ -1,30 +1,33 @@ # SPDX-License-Identifier: GPL-2.0-or-later -image="${TEST_IMAGES[fedora]}" -user="${TEST_IMAGES_DEFAULT_USER[fedora]}" - -__engine run \ - --rm --detach \ - --name publish \ - --publish 127.0.0.1::8000 \ - "$image" - -endpoint=$( __engine port publish | tee /dev/stderr | cut -d' ' -f3 ) - -__engine exec publish --as "$user" - -__log 'Ensuring curl fails...' -! curl "$endpoint" 2>/dev/null - -__engine exec publish --as "$user" python -m http.server & trap '__engine stop publish' EXIT -__log 'Ensuring curl succeeds...' +for os in fedora fedora-bootc; do -i=0 -max_tries=30 + image="${TEST_IMAGES[$os]}" + user="${TEST_IMAGES_DEFAULT_USER[$os]}" + + __engine run --rm --detach --name publish --publish 127.0.0.1::8000 "$image" + + endpoint=$( __engine port publish | tee /dev/stderr | cut -d' ' -f3 ) + + __engine exec publish --as "$user" + + __log 'Ensuring curl fails...' + ! curl "$endpoint" 2>/dev/null + + __engine exec publish --as "$user" python -m http.server & + + __log 'Ensuring curl succeeds...' + + i=0 + max_tries=30 + + until [[ "$( curl "$endpoint" 2>/dev/null )" == ''* ]]; do + (( ++i < max_tries )) + sleep 1 + done + + __engine stop publish -until [[ "$( curl "$endpoint" 2>/dev/null )" == ''* ]]; do - (( ++i < max_tries )) - sleep 1 done