feat: set THP_DISABLE=true in shim, and restore it before starting runc

If /sys/kernel/mm/transparent_hugepage/enabled=always, the shim process
will use huge pages, which will consume a lot of memory.

Just like this:
ps -efo pid,rss,comm | grep shim
    PID   RSS COMMAND
   2614  7464 containerd-shim

I don't think shim needs to use huge pages, and if we turn off the huge
pages option, we can save a lot of memory resources.

After we set THP_DISABLE=true:
ps -efo pid,comm,rss
    PID COMMAND           RSS
1629841 containerd-shim  5648

containerd
    |
    |--shim1   --start
        |
        |--shim2    (this shim will keep running on the host)
            |
            |--runc create (when containerd send create request by ttrpc)
                |
                |--runc init (this is the pid 1 in container)

    We should set thp_disabled=1 in "shim1 --start", because if we set it
    in shim2, huge pages have already been set up by the time func main()
    is running, and setting thp_disabled at that point cannot change the
    huge pages that are already in use.
    So we need to set thp_disabled=1 in shim1, so that shim2 inherits the
    setting from its parent process shim1 and already has huge pages
    disabled when it starts.

    For runc processes, we need to restore thp_disabled to its previous
    value (the value 'before' the shim changed it) in shim2, after
    fork() and before execve(). So we use cmd.pre_exec to do this.
This commit is contained in:
zhang yu 10307750 2023-09-11 11:49:13 +08:00 committed by Fu Wei
parent 1b2a74aa8a
commit 3a7b9ce173
5 changed files with 38 additions and 2 deletions

View File

@ -31,6 +31,7 @@ log = {version = "0.4.2", features=["kv_unstable"]}
nix = "0.27"
oci-spec = "0.6"
os_pipe = "1.1"
prctl = "1.0.0"
prost = "0.12"
prost-types = "0.12"
serde = { version = "1.0", features = ["derive"] }

View File

@ -31,6 +31,7 @@ libc.workspace = true
log.workspace = true
nix = { workspace = true, features = ["socket", "uio", "term"] }
oci-spec.workspace = true
prctl.workspace = true
runc = { path = "../runc", version = "0.2.0", features = ["async"] }
serde.workspace = true
serde_json.workspace = true

View File

@ -78,8 +78,27 @@ impl Shim for Service {
}
None => {}
}
#[cfg(not(target_os = "linux"))]
let thp_disabled = String::new();
#[cfg(target_os = "linux")]
// Our goal is to set thp_disable = true on the shim side and then restore the
// original thp_disable value before starting runc. So we only need the return
// value of prctl::get_thp_disable, which is Result<bool, i32>.
let thp_disabled = match prctl::get_thp_disable() {
Ok(x) => {
// prctl::set_thp_disable returns Result<(), i32>, but we deliberately
// ignore the result: even if the setting fails, the shim process
// should not exit, so there is nothing useful to do with the error.
let _ = prctl::set_thp_disable(true);
x.to_string()
}
Err(_) => String::new(),
};
let vars: Vec<(&str, &str)> = vec![("THP_DISABLED", thp_disabled.as_str())];
let address = spawn(opts, &grouping, Vec::new()).await?;
let address = spawn(opts, &grouping, vars).await?;
write_str_to_file("address", &address).await?;
Ok(address)
}

View File

@ -22,6 +22,7 @@ nix = { workspace = true, features = ["user", "fs"] }
oci-spec.workspace = true
os_pipe.workspace = true
path-absolutize = "3.0.11"
prctl.workspace = true
rand = "0.8.4"
serde.workspace = true
serde_json.workspace = true

View File

@ -368,8 +368,22 @@ pub trait Spawner: Debug {
/// and some other utilities.
#[cfg(feature = "async")]
impl Runc {
async fn launch(&self, cmd: Command, combined_output: bool) -> Result<Response> {
async fn launch(&self, mut cmd: Command, combined_output: bool) -> Result<Response> {
debug!("Execute command {:?}", cmd);
unsafe {
cmd.pre_exec(move || {
#[cfg(target_os = "linux")]
if let Ok(thp) = std::env::var("THP_DISABLED") {
if let Ok(thp_disabled) = thp.parse::<bool>() {
if let Err(e) = prctl::set_thp_disable(thp_disabled) {
debug!("set_thp_disable err: {}", e);
};
}
}
Ok(())
});
}
let (status, pid, stdout, stderr) = self.spawner.execute(cmd).await?;
if status.success() {
let output = if combined_output {