feat: add cgroup v2 support for metrics
This commit adds cgroup v2 support for collecting metrics in the shim. Additionally, it uses the CPU controller instead of the CPUAcct controller for reporting CPU metrics back to containerd. Signed-off-by: jiaxiao zhou <jiazho@microsoft.com>
This commit is contained in:
parent
5c55e352de
commit
39532a3c00
|
|
@ -22,7 +22,7 @@ use cgroups_rs::{
|
|||
cgroup::get_cgroups_relative_paths_by_pid, hierarchies, Cgroup, CgroupPid, MaxValue, Subsystem,
|
||||
};
|
||||
use containerd_shim_protos::{
|
||||
cgroups::metrics::{CPUStat, CPUUsage, MemoryEntry, MemoryStat, Metrics},
|
||||
cgroups::metrics::{CPUStat, CPUUsage, MemoryEntry, MemoryStat, Metrics, PidsStat, Throttle},
|
||||
protobuf::{well_known_types::any::Any, Message},
|
||||
shim::oci::Options,
|
||||
};
|
||||
|
|
@ -96,20 +96,68 @@ fn write_process_oom_score(pid: u32, score: i64) -> Result<()> {
|
|||
/// Collect process cgroup stats, return only necessary parts of it
|
||||
pub fn collect_metrics(pid: u32) -> Result<Metrics> {
|
||||
let mut metrics = Metrics::new();
|
||||
// get container main process cgroup
|
||||
let path =
|
||||
get_cgroups_relative_paths_by_pid(pid).map_err(other_error!(e, "get process cgroup"))?;
|
||||
let cgroup = Cgroup::load_with_relative_paths(hierarchies::auto(), Path::new("."), path);
|
||||
|
||||
let hierarchies = hierarchies::auto();
|
||||
let cgroup = if hierarchies.v2() {
|
||||
let path = format!("/proc/{}/cgroup", pid);
|
||||
let content = fs::read_to_string(path).map_err(io_error!(e, "read cgroup"))?;
|
||||
let content = content.strip_suffix('\n').unwrap_or_default();
|
||||
|
||||
let parts: Vec<&str> = content.split("::").collect();
|
||||
let path_parts: Vec<&str> = parts[1].split('/').collect();
|
||||
let namespace = path_parts[1];
|
||||
let cgroup_name = path_parts[2];
|
||||
Cgroup::load(
|
||||
hierarchies,
|
||||
format!("/sys/fs/cgroup/{namespace}/{cgroup_name}").as_str(),
|
||||
)
|
||||
} else {
|
||||
// get container main process cgroup
|
||||
let path = get_cgroups_relative_paths_by_pid(pid)
|
||||
.map_err(other_error!(e, "get process cgroup"))?;
|
||||
Cgroup::load_with_relative_paths(hierarchies::auto(), Path::new("."), path)
|
||||
};
|
||||
|
||||
// to make it easy, fill the necessary metrics only.
|
||||
for sub_system in Cgroup::subsystems(&cgroup) {
|
||||
match sub_system {
|
||||
Subsystem::CpuAcct(cpuacct_ctr) => {
|
||||
Subsystem::Cpu(cpu_ctr) => {
|
||||
let mut cpu_usage = CPUUsage::new();
|
||||
cpu_usage.set_total(cpuacct_ctr.cpuacct().usage);
|
||||
let mut cpu_stat = CPUStat::new();
|
||||
cpu_stat.set_usage(cpu_usage);
|
||||
metrics.set_cpu(cpu_stat);
|
||||
let mut throttle = Throttle::new();
|
||||
let stat = cpu_ctr.cpu().stat;
|
||||
for line in stat.lines() {
|
||||
let parts = line.split(' ').collect::<Vec<&str>>();
|
||||
if parts.len() != 2 {
|
||||
Err(Error::Other(format!("invalid cpu stat line: {}", line)))?;
|
||||
}
|
||||
|
||||
// https://github.com/opencontainers/runc/blob/dbe8434359ca35af1c1e10df42b1f4391c1e1010/libcontainer/cgroups/fs2/cpu.go#L70
|
||||
match parts[0] {
|
||||
"usage_usec" => {
|
||||
cpu_usage.set_total(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
"user_usec" => {
|
||||
cpu_usage.set_user(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
"system_usec" => {
|
||||
cpu_usage.set_kernel(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
"nr_periods" => {
|
||||
throttle.set_periods(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
"nr_throttled" => {
|
||||
throttle.set_throttled_periods(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
"throttled_usec" => {
|
||||
throttle.set_throttled_time(parts[1].parse::<u64>().unwrap());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let mut cpu_stats = CPUStat::new();
|
||||
cpu_stats.set_throttling(throttle);
|
||||
cpu_stats.set_usage(cpu_usage);
|
||||
metrics.set_cpu(cpu_stats);
|
||||
}
|
||||
Subsystem::Mem(mem_ctr) => {
|
||||
let mem = mem_ctr.memory_stat();
|
||||
|
|
@ -120,6 +168,25 @@ pub fn collect_metrics(pid: u32) -> Result<Metrics> {
|
|||
mem_stat.set_total_inactive_file(mem.stat.total_inactive_file);
|
||||
metrics.set_memory(mem_stat);
|
||||
}
|
||||
Subsystem::Pid(pid_ctr) => {
|
||||
let mut pid_stats = PidsStat::new();
|
||||
pid_stats.set_current(
|
||||
pid_ctr
|
||||
.get_pid_current()
|
||||
.map_err(other_error!(e, "get current pid"))?,
|
||||
);
|
||||
pid_stats.set_limit(
|
||||
pid_ctr
|
||||
.get_pid_max()
|
||||
.map(|val| match val {
|
||||
// See https://github.com/opencontainers/runc/blob/dbe8434359ca35af1c1e10df42b1f4391c1e1010/libcontainer/cgroups/fs/pids.go#L55
|
||||
cgroups_rs::MaxValue::Max => 0,
|
||||
cgroups_rs::MaxValue::Value(val) => val as u64,
|
||||
})
|
||||
.map_err(other_error!(e, "get pid limit"))?,
|
||||
);
|
||||
metrics.set_pids(pid_stats)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue