Fix incorrect process_cpu_seconds_total metric (#7)

Fixes linkerd/linkerd2#1239.

The proxy's `process_cpu_seconds_total` metric is currently calculated
incorrectly and will differ from the CPU stats reported by other 
sources. This is because it currently calculates the CPU time by summing
the `utime` and `stime` fields of the stat struct returned by `procinfo`.
However, those numbers are expressed in _clock ticks_, not seconds, so
the metric is expressed in the wrong unit.

This branch fixes this issue by using `sysconf` to get the number of
clock ticks per second when the process sensor is created, and then
dividing `utime + stime` by that number, so that the value is expressed
in seconds.

## Demonstration:

(Note that the last column in `ps aux` output is the CPU time total)
```
eliza@ares:~$ ps aux | grep linkerd2-proxy | grep -v grep
eliza    40703  0.2  0.0  45580 14864 pts/0    Sl+  13:49   0:03 target/debug/linkerd2-proxy
eliza@ares:~$ curl localhost:4191/metrics
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 3
# HELP process_open_fds Number of open file descriptors.
# TYPE process_open_fds gauge
process_open_fds 19
# HELP process_max_fds Maximum number of open file descriptors.
# TYPE process_max_fds gauge
process_max_fds 1024
# HELP process_virtual_memory_bytes Virtual memory size in bytes.
# TYPE process_virtual_memory_bytes gauge
process_virtual_memory_bytes 46673920
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 15220736
# HELP process_start_time_seconds Time that the process started (in seconds since the UNIX epoch)
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1531428584
eliza@ares:~$
```

Signed-off-by: Eliza Weisman <eliza@buoyant.io>
This commit is contained in:
Eliza Weisman 2018-07-17 15:48:02 -07:00 committed by GitHub
parent 1fdcbfaba6
commit f208acb3a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 19 additions and 11 deletions

View File

@ -70,21 +70,28 @@ mod imp {
#[derive(Debug)]
pub struct Sensor {
page_size: usize,
page_size: u64,
clock_ticks_per_sec: u64,
}
fn sysconf(num: libc::c_int, name: &'static str) -> Result<u64, io::Error> {
match unsafe { libc::sysconf(num) } {
e if e <= 0 => {
let error = io::Error::last_os_error();
error!("error getting {}: {:?}", name, error);
Err(error)
},
val => Ok(val as u64),
}
}
impl Sensor {
pub fn new() -> io::Result<Sensor> {
let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } {
e if e < 0 => {
let error = io::Error::last_os_error();
error!("error getting page size: {:?}", error);
return Err(error);
},
page_size => page_size as usize,
};
let page_size = sysconf(libc::_SC_PAGESIZE, "page size")?;
let clock_ticks_per_sec = sysconf(libc::_SC_CLK_TCK, "clock ticks per second")?;
Ok(Sensor {
page_size,
clock_ticks_per_sec,
})
}
@ -92,9 +99,10 @@ mod imp {
// XXX potentially blocking call
let stat = pid::stat_self()?;
let cpu_seconds_total = Counter::from((stat.utime + stat.stime) as u64);
let clock_ticks = stat.utime as u64 + stat.stime as u64;
let cpu_seconds_total = Counter::from(clock_ticks / self.clock_ticks_per_sec);
let virtual_memory_bytes = Gauge::from(stat.vsize as u64);
let resident_memory_bytes = Gauge::from((stat.rss * self.page_size) as u64);
let resident_memory_bytes = Gauge::from(stat.rss as u64 * self.page_size);
let metrics = ProcessMetrics {
cpu_seconds_total,