diff --git a/crates/runc-shim/src/synchronous/runc.rs b/crates/runc-shim/src/synchronous/runc.rs index df07eb1..1df4bf8 100644 --- a/crates/runc-shim/src/synchronous/runc.rs +++ b/crates/runc-shim/src/synchronous/runc.rs @@ -36,7 +36,7 @@ use shim::api::*; use shim::console::ConsoleSocket; use shim::error::{Error, Result}; use shim::io::Stdio; -use shim::monitor::{monitor_subscribe, ExitEvent, Subject, Subscription, Topic}; +use shim::monitor::{monitor_subscribe, wait_pid, ExitEvent, Subject, Subscription, Topic}; use shim::mount::mount_rootfs; use shim::protos::api::ProcessInfo; use shim::protos::cgroups::metrics::Metrics; @@ -672,17 +672,3 @@ where } "".to_string() } - -fn wait_pid(pid: i32, s: Subscription) -> i32 { - loop { - if let Ok(ExitEvent { - subject: Subject::Pid(epid), - exit_code: code, - }) = s.rx.recv() - { - if pid == epid { - return code; - } - } - } -} diff --git a/crates/shim/Cargo.toml b/crates/shim/Cargo.toml index 625fbc3..29c70ff 100644 --- a/crates/shim/Cargo.toml +++ b/crates/shim/Cargo.toml @@ -32,6 +32,8 @@ uuid = { version = "1.0.0", features = ["v4"] } signal-hook = "0.3.13" oci-spec = "0.5.4" prctl = "1.0.0" +page_size = "0.4.2" +regex = "1" containerd-shim-protos = { path = "../shim-protos", version = "0.2.0" } diff --git a/crates/shim/src/mount.rs b/crates/shim/src/mount.rs index ad38536..d49445d 100644 --- a/crates/shim/src/mount.rs +++ b/crates/shim/src/mount.rs @@ -16,14 +16,21 @@ #![allow(unused)] use std::collections::HashMap; +use std::env; use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, Not}; use std::path::Path; use lazy_static::lazy_static; +use log::error; #[cfg(target_os = "linux")] use nix::mount::{mount, MsFlags}; +#[cfg(target_os = "linux")] +use nix::unistd::{fork, ForkResult}; +use regex::Regex; use crate::error::{Error, Result}; +#[cfg(not(feature = "async"))] +use crate::monitor::{monitor_subscribe, wait_pid, Topic}; #[cfg(target_os = "linux")] struct Flag { @@ -31,6 +38,8 @@ struct Flag { flags: MsFlags, 
} +const OVERLAY_LOWERDIR_PREFIX: &str = "lowerdir="; + #[cfg(target_os = "linux")] lazy_static! { static ref MOUNT_FLAGS: HashMap<&'static str, Flag> = { @@ -223,6 +232,300 @@ lazy_static! { static ref MS_BIND_RO: MsFlags = MsFlags::MS_BIND.bitor(MsFlags::MS_RDONLY); } +fn options_size(options: &[String]) -> usize { + options.iter().fold(0, |sum, x| sum + x.len()) +} + +fn longest_common_prefix(dirs: &[String]) -> Option<String> { + if dirs.is_empty() { + return None; + } + if dirs.len() == 1 { + if dirs[0].is_empty() { + return None; + } + return Some(dirs[0].to_string()); + } + + let min = dirs.iter().min().unwrap(); + let max = dirs.iter().max().unwrap(); + let min_chars = min.chars().collect::<Vec<char>>(); + let max_chars = max.chars().collect::<Vec<char>>(); + let mut i = 0; + while i < min_chars.len() && i < max_chars.len() { + if min_chars[i] != max_chars[i] { + if i == 0 { + return None; + } + return Some(min_chars[..i].iter().collect()); + } + i += 1; + } + + if min.is_empty() { + None + } else { + Some(min.to_string()) + } +} + +// NOTE: snapshot ids are numeric, so the common prefix can end inside an id (e.g. snapshots/1). +// To avoid returning snapshots/x, fall back to the parent directory. +// This assumes that the common directory is ${root}/io.containerd.v1.overlayfs/snapshots. 
+#[cfg(target_os = "linux")] +fn trim_flawed_dir(s: &str) -> String { + let r = Regex::new(r"((/[^/]+)+/)([^/]*)").unwrap(); + r.replace(s, "$1").to_string() +} + +#[cfg(target_os = "linux")] +struct LowerdirCompactor { + options: Vec, + lowerdirs: Option>, + lowerdir_prefix: Option, +} + +#[cfg(target_os = "linux")] +impl LowerdirCompactor { + fn new(options: &[String]) -> Self { + Self { + options: options.to_vec(), + lowerdirs: None, + lowerdir_prefix: None, + } + } + + fn lowerdirs(&mut self) -> &mut Self { + self.lowerdirs = Some( + self.options + .iter() + .filter(|x| x.starts_with(OVERLAY_LOWERDIR_PREFIX)) + .map(|x| x.strip_prefix(OVERLAY_LOWERDIR_PREFIX).unwrap_or(x)) + .flat_map(|x| x.split(':')) + .map(str::to_string) + .collect(), + ); + self + } + + fn lowerdir_prefix(&mut self) -> &mut Self { + self.lowerdir_prefix = self + .lowerdirs + .as_ref() + .filter(|x| x.len() > 1) + .map(|x| longest_common_prefix(x)) + .unwrap_or(None) + .filter(|x| x != "/") + .map(|x| trim_flawed_dir(&x)) + .filter(|x| !x.is_empty() && x != "/"); + self + } + + fn compact(&mut self) -> (Option, Vec) { + self.lowerdirs().lowerdir_prefix(); + if let Some(chdir) = &self.lowerdir_prefix { + let lowerdir_str = self + .lowerdirs + .as_ref() + .unwrap_or(&Vec::new()) + .iter() + .map(|x| x.strip_prefix(chdir).unwrap_or(x)) + .collect::>() + .join(":"); + let replace = |x: &str| -> String { + if x.starts_with(OVERLAY_LOWERDIR_PREFIX) { + format!("{}{}", OVERLAY_LOWERDIR_PREFIX, lowerdir_str) + } else { + x.to_string() + } + }; + ( + self.lowerdir_prefix.clone(), + self.options + .iter() + .map(|x| replace(x)) + .collect::>(), + ) + } else { + (None, self.options.to_vec()) + } + } +} + +enum MountExitCode { + NixUnknownErr, + ChdirErr, + Success, + NixOtherErr(i32), +} + +impl From for MountExitCode { + fn from(code: i32) -> Self { + match code { + -2 => MountExitCode::NixUnknownErr, + -1 => MountExitCode::ChdirErr, + 0 => MountExitCode::Success, + _ => 
MountExitCode::NixOtherErr(code), + } + } +} + +impl From for i32 { + fn from(code: MountExitCode) -> Self { + match code { + MountExitCode::NixUnknownErr => -2, + MountExitCode::ChdirErr => -1, + MountExitCode::Success => 0, + MountExitCode::NixOtherErr(errno) => errno, + } + } +} + +impl From for MountExitCode { + fn from(err: nix::errno::Errno) -> Self { + match err { + nix::errno::Errno::UnknownErrno => MountExitCode::NixUnknownErr, + _ => MountExitCode::NixOtherErr(err as i32), + } + } +} + +impl From for nix::errno::Errno { + fn from(code: MountExitCode) -> Self { + match code { + MountExitCode::NixOtherErr(errno) => nix::errno::Errno::from_i32(errno), + _ => nix::errno::Errno::UnknownErrno, + } + } +} + +impl From for Result<()> { + fn from(code: MountExitCode) -> Self { + match code { + MountExitCode::NixUnknownErr => Err(other!( + "mount process exit unexpectedly, exit code: {}", + nix::errno::Errno::from(code) + )), + MountExitCode::ChdirErr => Err(other!("mount process exit unexpectedly: chdir failed")), + MountExitCode::Success => Ok(()), + MountExitCode::NixOtherErr(errno) => Err(other!( + "mount process exit unexpectedly, exit code: {}", + nix::errno::Errno::from_i32(errno) + )), + } + } +} + +#[cfg(not(feature = "async"))] +#[cfg(target_os = "linux")] +pub fn mount_rootfs( + fs_type: Option<&str>, + source: Option<&str>, + options: &[String], + target: impl AsRef, +) -> Result<()> { + //TODO add helper to mount fuse + let max_size = page_size::get(); + // avoid hitting the one-page limit of the mount argument buffer + // + // NOTE: 512 is a safety margin subtracted from the page size for this check. 
+ let (chdir, options) = + if fs_type.unwrap_or("") == "overlay" && options_size(options) >= max_size - 512 { + LowerdirCompactor::new(options).compact() + } else { + (None, options.to_vec()) + }; + + let mut flags: MsFlags = MsFlags::from_bits(0).unwrap(); + let mut data = Vec::new(); + options.iter().for_each(|x| { + if let Some(f) = MOUNT_FLAGS.get(x.as_str()) { + if f.clear { + flags.bitand_assign(f.flags.not()); + } else { + flags.bitor_assign(f.flags) + } + } else { + data.push(x.as_str()) + } + }); + + let opt = data.join(","); + if opt.len() > max_size { + return Err(other!("mount option is too long")); + } + + let data = if !data.is_empty() { + Some(opt.as_str()) + } else { + None + }; + + let s = monitor_subscribe(Topic::All)?; + match unsafe { fork() } { + Ok(ForkResult::Parent { child, .. }) => { + let code: MountExitCode = wait_pid(i32::from(child), s).into(); + code.into() + } + Ok(ForkResult::Child) => { + if let Some(workdir) = chdir { + env::set_current_dir(Path::new(&workdir)).unwrap_or_else(|_| { + unsafe { libc::_exit(i32::from(MountExitCode::ChdirErr)) }; + }); + } + // mount with non-propagation first, or remount with changed data + let oflags = flags.bitand(PROPAGATION_TYPES.not()); + let zero: MsFlags = MsFlags::from_bits(0).unwrap(); + if flags.bitand(MsFlags::MS_REMOUNT).eq(&zero) || data != None { + mount(source, target.as_ref(), fs_type, oflags, data).unwrap_or_else(|err| { + error!( + "Mount {:?} to {} failed: {}", + source, + target.as_ref().display(), + err + ); + let code: MountExitCode = err.into(); + unsafe { libc::_exit(code.into()) }; + }); + } + // change the propagation type + if flags.bitand(*PROPAGATION_TYPES).ne(&zero) { + mount::(None, target.as_ref(), None, *MS_PROPAGATION, None) + .unwrap_or_else(|err| { + error!( + "Change {} mount propagation faied: {}", + target.as_ref().display(), + err + ); + let code: MountExitCode = err.into(); + unsafe { libc::_exit(code.into()) }; + }); + } + if 
oflags.bitand(*MS_BIND_RO).eq(&MS_BIND_RO) { + mount::( + None, + target.as_ref(), + None, + oflags.bitor(MsFlags::MS_REMOUNT), + None, + ) + .unwrap_or_else(|err| { + error!( + "Change {} read-only failed: {}", + target.as_ref().display(), + err + ); + let code: MountExitCode = err.into(); + unsafe { libc::_exit(code.into()) }; + }); + } + unsafe { libc::_exit(i32::from(MountExitCode::Success)) }; + } + Err(_) => Err(other!("fork mount process failed")), + } +} + +#[cfg(feature = "async")] #[cfg(target_os = "linux")] pub fn mount_rootfs( fs_type: Option<&str>, @@ -299,3 +602,111 @@ pub fn mount_rootfs( ) -> Result<()> { Err(Error::Unimplemented("start".to_string())) } + +#[cfg(test)] +#[cfg(target_os = "linux")] +mod tests { + use super::*; + + #[test] + fn test_trim_flawed_dir() { + let mut tcases: Vec<(&str, String)> = Vec::new(); + tcases.push(("/.foo-_bar/foo", "/.foo-_bar/".to_string())); + tcases.push(("/.foo-_bar/foo/", "/.foo-_bar/foo/".to_string())); + tcases.push(("/.foo-_bar/foo/bar", "/.foo-_bar/foo/".to_string())); + tcases.push(("/.foo-_bar/foo/bar/", "/.foo-_bar/foo/bar/".to_string())); + for (case, expected) in tcases { + let res = trim_flawed_dir(case); + assert_eq!(res, expected); + } + } + + #[test] + fn test_longest_common_prefix() { + let mut tcases: Vec<(Vec, Option)> = Vec::new(); + tcases.push((vec![], None)); + tcases.push((vec!["foo".to_string()], Some("foo".to_string()))); + tcases.push((vec!["foo".to_string(), "bar".to_string()], None)); + tcases.push(( + vec!["foo".to_string(), "foo".to_string()], + Some("foo".to_string()), + )); + tcases.push(( + vec!["foo".to_string(), "foobar".to_string()], + Some("foo".to_string()), + )); + tcases.push(( + vec!["foo".to_string(), "".to_string(), "foobar".to_string()], + None, + )); + for (case, expected) in tcases { + let res = longest_common_prefix(&case); + assert_eq!(res, expected); + } + } + + #[test] + fn test_compact_lowerdir_option() { + let mut tcases: Vec<(Vec, Option, Vec)> = Vec::new(); 
+ tcases.push(( + vec!["workdir=a".to_string()], + None, + vec!["workdir=a".to_string()], + )); + tcases.push(( + vec!["workdir=a".to_string(), "lowerdir=b".to_string()], + None, + vec!["workdir=a".to_string(), "lowerdir=b".to_string()], + )); + tcases.push(( + vec!["lowerdir=/snapshots/1/fs:/snapshots/10/fs".to_string()], + Some("/snapshots/".to_string()), + vec!["lowerdir=1/fs:10/fs".to_string()], + )); + tcases.push(( + vec![ + "workdir=a".to_string(), + "lowerdir=/snapshots/1/fs:/snapshots/10/fs".to_string(), + ], + Some("/snapshots/".to_string()), + vec!["workdir=a".to_string(), "lowerdir=1/fs:10/fs".to_string()], + )); + tcases.push(( + vec!["lowerdir=/snapshots/1/fs:/snapshots/10/fs:/snapshots/2/fs".to_string()], + Some("/snapshots/".to_string()), + vec!["lowerdir=1/fs:10/fs:2/fs".to_string()], + )); + tcases.push(( + vec![ + "workdir=a".to_string(), + "lowerdir=/snapshots/1/fs:/snapshots/10/fs:/snapshots/2/fs".to_string(), + ], + Some("/snapshots/".to_string()), + vec![ + "workdir=a".to_string(), + "lowerdir=1/fs:10/fs:2/fs".to_string(), + ], + )); + tcases.push(( + vec!["lowerdir=/snapshots/1/fs:/other_snapshots/1/fs".to_string()], + None, + vec!["lowerdir=/snapshots/1/fs:/other_snapshots/1/fs".to_string()], + )); + tcases.push(( + vec![ + "workdir=a".to_string(), + "lowerdir=/snapshots/1/fs:/other_snapshots/1/fs".to_string(), + ], + None, + vec![ + "workdir=a".to_string(), + "lowerdir=/snapshots/1/fs:/other_snapshots/1/fs".to_string(), + ], + )); + for (case, expected_chdir, expected_options) in tcases { + let (chdir, options) = LowerdirCompactor::new(&case).compact(); + assert_eq!(chdir, expected_chdir); + assert_eq!(options, expected_options); + } + } +} diff --git a/crates/shim/src/synchronous/monitor.rs b/crates/shim/src/synchronous/monitor.rs index ef6feef..5f2f49d 100644 --- a/crates/shim/src/synchronous/monitor.rs +++ b/crates/shim/src/synchronous/monitor.rs @@ -135,3 +135,17 @@ impl Drop for Subscription { }); } } + +pub fn wait_pid(pid: i32, s: 
Subscription) -> i32 { + loop { + if let Ok(ExitEvent { + subject: Subject::Pid(epid), + exit_code: code, + }) = s.rx.recv() + { + if pid == epid { + return code; + } + } + } +} // NOTE(review): an Err from s.rx.recv() is ignored and the loop simply retries; if recv() can fail permanently (e.g. the sending side is dropped) this never returns - confirm.