feat: add cgroup v2 support for metrics
This commit adds cgroup v2 support for collecting metrics in the shim. Additionally, it uses the CPU controller instead of the CPUAcct controller for reporting CPU metrics back to containerd. Signed-off-by: jiaxiao zhou <jiazho@microsoft.com>
This commit is contained in:
parent
5c55e352de
commit
39532a3c00
|
|
@ -22,7 +22,7 @@ use cgroups_rs::{
|
||||||
cgroup::get_cgroups_relative_paths_by_pid, hierarchies, Cgroup, CgroupPid, MaxValue, Subsystem,
|
cgroup::get_cgroups_relative_paths_by_pid, hierarchies, Cgroup, CgroupPid, MaxValue, Subsystem,
|
||||||
};
|
};
|
||||||
use containerd_shim_protos::{
|
use containerd_shim_protos::{
|
||||||
cgroups::metrics::{CPUStat, CPUUsage, MemoryEntry, MemoryStat, Metrics},
|
cgroups::metrics::{CPUStat, CPUUsage, MemoryEntry, MemoryStat, Metrics, PidsStat, Throttle},
|
||||||
protobuf::{well_known_types::any::Any, Message},
|
protobuf::{well_known_types::any::Any, Message},
|
||||||
shim::oci::Options,
|
shim::oci::Options,
|
||||||
};
|
};
|
||||||
|
|
@ -96,20 +96,68 @@ fn write_process_oom_score(pid: u32, score: i64) -> Result<()> {
|
||||||
/// Collect process cgroup stats, return only necessary parts of it
|
/// Collect process cgroup stats, return only necessary parts of it
|
||||||
pub fn collect_metrics(pid: u32) -> Result<Metrics> {
|
pub fn collect_metrics(pid: u32) -> Result<Metrics> {
|
||||||
let mut metrics = Metrics::new();
|
let mut metrics = Metrics::new();
|
||||||
// get container main process cgroup
|
|
||||||
let path =
|
let hierarchies = hierarchies::auto();
|
||||||
get_cgroups_relative_paths_by_pid(pid).map_err(other_error!(e, "get process cgroup"))?;
|
let cgroup = if hierarchies.v2() {
|
||||||
let cgroup = Cgroup::load_with_relative_paths(hierarchies::auto(), Path::new("."), path);
|
let path = format!("/proc/{}/cgroup", pid);
|
||||||
|
let content = fs::read_to_string(path).map_err(io_error!(e, "read cgroup"))?;
|
||||||
|
let content = content.strip_suffix('\n').unwrap_or_default();
|
||||||
|
|
||||||
|
let parts: Vec<&str> = content.split("::").collect();
|
||||||
|
let path_parts: Vec<&str> = parts[1].split('/').collect();
|
||||||
|
let namespace = path_parts[1];
|
||||||
|
let cgroup_name = path_parts[2];
|
||||||
|
Cgroup::load(
|
||||||
|
hierarchies,
|
||||||
|
format!("/sys/fs/cgroup/{namespace}/{cgroup_name}").as_str(),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
// get container main process cgroup
|
||||||
|
let path = get_cgroups_relative_paths_by_pid(pid)
|
||||||
|
.map_err(other_error!(e, "get process cgroup"))?;
|
||||||
|
Cgroup::load_with_relative_paths(hierarchies::auto(), Path::new("."), path)
|
||||||
|
};
|
||||||
|
|
||||||
// to make it easy, fill the necessary metrics only.
|
// to make it easy, fill the necessary metrics only.
|
||||||
for sub_system in Cgroup::subsystems(&cgroup) {
|
for sub_system in Cgroup::subsystems(&cgroup) {
|
||||||
match sub_system {
|
match sub_system {
|
||||||
Subsystem::CpuAcct(cpuacct_ctr) => {
|
Subsystem::Cpu(cpu_ctr) => {
|
||||||
let mut cpu_usage = CPUUsage::new();
|
let mut cpu_usage = CPUUsage::new();
|
||||||
cpu_usage.set_total(cpuacct_ctr.cpuacct().usage);
|
let mut throttle = Throttle::new();
|
||||||
let mut cpu_stat = CPUStat::new();
|
let stat = cpu_ctr.cpu().stat;
|
||||||
cpu_stat.set_usage(cpu_usage);
|
for line in stat.lines() {
|
||||||
metrics.set_cpu(cpu_stat);
|
let parts = line.split(' ').collect::<Vec<&str>>();
|
||||||
|
if parts.len() != 2 {
|
||||||
|
Err(Error::Other(format!("invalid cpu stat line: {}", line)))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://github.com/opencontainers/runc/blob/dbe8434359ca35af1c1e10df42b1f4391c1e1010/libcontainer/cgroups/fs2/cpu.go#L70
|
||||||
|
match parts[0] {
|
||||||
|
"usage_usec" => {
|
||||||
|
cpu_usage.set_total(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
"user_usec" => {
|
||||||
|
cpu_usage.set_user(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
"system_usec" => {
|
||||||
|
cpu_usage.set_kernel(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
"nr_periods" => {
|
||||||
|
throttle.set_periods(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
"nr_throttled" => {
|
||||||
|
throttle.set_throttled_periods(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
"throttled_usec" => {
|
||||||
|
throttle.set_throttled_time(parts[1].parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut cpu_stats = CPUStat::new();
|
||||||
|
cpu_stats.set_throttling(throttle);
|
||||||
|
cpu_stats.set_usage(cpu_usage);
|
||||||
|
metrics.set_cpu(cpu_stats);
|
||||||
}
|
}
|
||||||
Subsystem::Mem(mem_ctr) => {
|
Subsystem::Mem(mem_ctr) => {
|
||||||
let mem = mem_ctr.memory_stat();
|
let mem = mem_ctr.memory_stat();
|
||||||
|
|
@ -120,6 +168,25 @@ pub fn collect_metrics(pid: u32) -> Result<Metrics> {
|
||||||
mem_stat.set_total_inactive_file(mem.stat.total_inactive_file);
|
mem_stat.set_total_inactive_file(mem.stat.total_inactive_file);
|
||||||
metrics.set_memory(mem_stat);
|
metrics.set_memory(mem_stat);
|
||||||
}
|
}
|
||||||
|
Subsystem::Pid(pid_ctr) => {
|
||||||
|
let mut pid_stats = PidsStat::new();
|
||||||
|
pid_stats.set_current(
|
||||||
|
pid_ctr
|
||||||
|
.get_pid_current()
|
||||||
|
.map_err(other_error!(e, "get current pid"))?,
|
||||||
|
);
|
||||||
|
pid_stats.set_limit(
|
||||||
|
pid_ctr
|
||||||
|
.get_pid_max()
|
||||||
|
.map(|val| match val {
|
||||||
|
// See https://github.com/opencontainers/runc/blob/dbe8434359ca35af1c1e10df42b1f4391c1e1010/libcontainer/cgroups/fs/pids.go#L55
|
||||||
|
cgroups_rs::MaxValue::Max => 0,
|
||||||
|
cgroups_rs::MaxValue::Value(val) => val as u64,
|
||||||
|
})
|
||||||
|
.map_err(other_error!(e, "get pid limit"))?,
|
||||||
|
);
|
||||||
|
metrics.set_pids(pid_stats)
|
||||||
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue