From f208acb3a5f7d6dd86e361ec0e5222a4a035f9ed Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jul 2018 15:48:02 -0700 Subject: [PATCH] Fix incorrect process_cpu_seconds_total metric (#7) Fixes linkerd/linkerd2#1239. The proxy's `process_cpu_seconds_total` metric is currently calculated incorrectly and will differ from the CPU stats reported by other sources. This is because it currently calculates the CPU time by summing the `utime` and `stime` fields of the stat struct returned by `procinfo`. However, those numbers are expressed in _clock ticks_, not seconds, so the metric is expressed in the wrong unit. This branch fixes this issue by using `sysconf` to get the number of clock ticks per second when the process sensor is created, and then dividing `utime + stime` by that number, so that the value is expressed in seconds. ## Demonstration: (Note that the last column in `ps aux` output is the CPU time total) ``` eliza@ares:~$ ps aux | grep linkerd2-proxy | grep -v grep eliza 40703 0.2 0.0 45580 14864 pts/0 Sl+ 13:49 0:03 target/debug/linkerd2-proxy eliza@ares:~$ curl localhost:4191/metrics # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE process_cpu_seconds_total counter process_cpu_seconds_total 3 # HELP process_open_fds Number of open file descriptors. # TYPE process_open_fds gauge process_open_fds 19 # HELP process_max_fds Maximum number of open file descriptors. # TYPE process_max_fds gauge process_max_fds 1024 # HELP process_virtual_memory_bytes Virtual memory size in bytes. # TYPE process_virtual_memory_bytes gauge process_virtual_memory_bytes 46673920 # HELP process_resident_memory_bytes Resident memory size in bytes. # TYPE process_resident_memory_bytes gauge process_resident_memory_bytes 15220736 # HELP process_start_time_seconds Time that the process started (in seconds since the UNIX epoch) # TYPE process_start_time_seconds gauge process_start_time_seconds 1531428584 eliza@ares:~$ ``` Signed-off-by: Eliza Weisman --- src/telemetry/metrics/process.rs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/telemetry/metrics/process.rs b/src/telemetry/metrics/process.rs index 30e3a74b8..eeaa0da2a 100644 --- a/src/telemetry/metrics/process.rs +++ b/src/telemetry/metrics/process.rs @@ -70,21 +70,28 @@ mod imp { #[derive(Debug)] pub struct Sensor { - page_size: usize, + page_size: u64, + clock_ticks_per_sec: u64, + } + + fn sysconf(num: libc::c_int, name: &'static str) -> Result { + match unsafe { libc::sysconf(num) } { + e if e <= 0 => { + let error = io::Error::last_os_error(); + error!("error getting {}: {:?}", name, error); + Err(error) + }, + val => Ok(val as u64), + } } impl Sensor { pub fn new() -> io::Result { - let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } { - e if e < 0 => { - let error = io::Error::last_os_error(); - error!("error getting page size: {:?}", error); - return Err(error); - }, - page_size => page_size as usize, - }; + let page_size = sysconf(libc::_SC_PAGESIZE, "page size")?; + let clock_ticks_per_sec = sysconf(libc::_SC_CLK_TCK, "clock ticks per second")?; Ok(Sensor { page_size, + clock_ticks_per_sec, }) } @@ -92,9 +99,10 @@ mod imp { // XXX potentially blocking call let stat = pid::stat_self()?; - let cpu_seconds_total = Counter::from((stat.utime + stat.stime) as u64); + let clock_ticks = stat.utime as u64 + stat.stime as u64; + let cpu_seconds_total = Counter::from(clock_ticks / self.clock_ticks_per_sec); let virtual_memory_bytes = Gauge::from(stat.vsize as u64); - let resident_memory_bytes = Gauge::from((stat.rss * self.page_size) as u64); + let resident_memory_bytes = Gauge::from(stat.rss as u64 * self.page_size); let metrics = ProcessMetrics { cpu_seconds_total,