Fix incorrect process_cpu_seconds_total metric (#7)
Fixes linkerd/linkerd2#1239. The proxy's `process_cpu_seconds_total` metric is currently calculated incorrectly and will differ from the CPU stats reported by other sources. This is because it currently calculates the CPU time by summing the `utime` and `stime` fields of the stat struct returned by `procinfo`. However, those numbers are expressed in _clock ticks_, not seconds, so the metric is expressed in the wrong unit. This branch fixes this issue by using `sysconf` to get the number of clock ticks per second when the process sensor is created, and then dividing `utime + stime` by that number, so that the value is expressed in seconds. ## Demonstration: (Note that the last column in `ps aux` output is the CPU time total) ``` eliza@ares:~$ ps aux | grep linkerd2-proxy | grep -v grep eliza 40703 0.2 0.0 45580 14864 pts/0 Sl+ 13:49 0:03 target/debug/linkerd2-proxy eliza@ares:~$ curl localhost:4191/metrics # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE process_cpu_seconds_total counter process_cpu_seconds_total 3 # HELP process_open_fds Number of open file descriptors. # TYPE process_open_fds gauge process_open_fds 19 # HELP process_max_fds Maximum number of open file descriptors. # TYPE process_max_fds gauge process_max_fds 1024 # HELP process_virtual_memory_bytes Virtual memory size in bytes. # TYPE process_virtual_memory_bytes gauge process_virtual_memory_bytes 46673920 # HELP process_resident_memory_bytes Resident memory size in bytes. # TYPE process_resident_memory_bytes gauge process_resident_memory_bytes 15220736 # HELP process_start_time_seconds Time that the process started (in seconds since the UNIX epoch) # TYPE process_start_time_seconds gauge process_start_time_seconds 1531428584 eliza@ares:~$ ``` Signed-off-by: Eliza Weisman <eliza@buoyant.io>
This commit is contained in:
parent
1fdcbfaba6
commit
f208acb3a5
|
@ -70,21 +70,28 @@ mod imp {
|
|||
|
||||
#[derive(Debug)]
|
||||
pub struct Sensor {
|
||||
page_size: usize,
|
||||
page_size: u64,
|
||||
clock_ticks_per_sec: u64,
|
||||
}
|
||||
|
||||
fn sysconf(num: libc::c_int, name: &'static str) -> Result<u64, io::Error> {
|
||||
match unsafe { libc::sysconf(num) } {
|
||||
e if e <= 0 => {
|
||||
let error = io::Error::last_os_error();
|
||||
error!("error getting {}: {:?}", name, error);
|
||||
Err(error)
|
||||
},
|
||||
val => Ok(val as u64),
|
||||
}
|
||||
}
|
||||
|
||||
impl Sensor {
|
||||
pub fn new() -> io::Result<Sensor> {
|
||||
let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } {
|
||||
e if e < 0 => {
|
||||
let error = io::Error::last_os_error();
|
||||
error!("error getting page size: {:?}", error);
|
||||
return Err(error);
|
||||
},
|
||||
page_size => page_size as usize,
|
||||
};
|
||||
let page_size = sysconf(libc::_SC_PAGESIZE, "page size")?;
|
||||
let clock_ticks_per_sec = sysconf(libc::_SC_CLK_TCK, "clock ticks per second")?;
|
||||
Ok(Sensor {
|
||||
page_size,
|
||||
clock_ticks_per_sec,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -92,9 +99,10 @@ mod imp {
|
|||
// XXX potentially blocking call
|
||||
let stat = pid::stat_self()?;
|
||||
|
||||
let cpu_seconds_total = Counter::from((stat.utime + stat.stime) as u64);
|
||||
let clock_ticks = stat.utime as u64 + stat.stime as u64;
|
||||
let cpu_seconds_total = Counter::from(clock_ticks / self.clock_ticks_per_sec);
|
||||
let virtual_memory_bytes = Gauge::from(stat.vsize as u64);
|
||||
let resident_memory_bytes = Gauge::from((stat.rss * self.page_size) as u64);
|
||||
let resident_memory_bytes = Gauge::from(stat.rss as u64 * self.page_size);
|
||||
|
||||
let metrics = ProcessMetrics {
|
||||
cpu_seconds_total,
|
||||
|
|
Loading…
Reference in New Issue