From 2f0ad6625736b14d9732105ba084c47872ec05fb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 21 Jun 2018 17:50:10 -0700 Subject: [PATCH] proxy: Add process stats to proxy metrics (on Linux) (#1128) This branch adds process stats to the proxy's metrics, as described in https://prometheus.io/docs/instrumenting/writing_clientlibs/#process-metrics. In particular, it adds metrics for the process's total CPU time, number of open file descriptors and max file descriptors, virtual memory size, and resident set size. This branch adds a dependency on the `procinfo` crate. Since this crate and the syscalls it wraps are Linux-specific, these stats are only reported on Linux. On other operating systems, they aren't reported. Manual testing Metrics scrape: ``` eliza@ares:~$ curl http://localhost:4191/metrics # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE process_cpu_seconds_total counter process_cpu_seconds_total 0 # HELP process_open_fds Number of open file descriptors. # TYPE process_open_fds gauge process_open_fds 19 # HELP process_max_fds Maximum number of open file descriptors. # TYPE process_max_fds gauge process_max_fds 1024 # HELP process_virtual_memory_bytes Virtual memory size in bytes. # TYPE process_virtual_memory_bytes gauge process_virtual_memory_bytes 45252608 # HELP process_resident_memory_bytes Resident memory size in bytes. # TYPE process_resident_memory_bytes gauge process_resident_memory_bytes 12132352 # HELP process_start_time_seconds Time that the process started (in seconds since the UNIX epoch) # TYPE process_start_time_seconds gauge process_start_time_seconds 1529017536 ``` Note that the `process_cpu_seconds_total` stat is 0 because I had just launched this Conduit instance and it wasn't seeing any load; it did go up after I sent a few requests to it. 
Confirm RSS & virtual memory stats w/ `ps`, and get Conduit's pid so we can check the fd stats (note that `ps` reports virt/rss in kb while Conduit's metrics reports them in bytes): ``` eliza@ares:~$ ps aux | grep conduit | grep -v grep eliza 16766 0.0 0.0 44192 12956 pts/2 Sl+ 16:05 0:00 target/debug/conduit-proxy ``` Count conduit process's open fds: ``` eliza@ares:~$ cd /proc/16766/fd eliza@ares:/proc/16766/fd$ ls -l | wc -l 18 ``` Signed-off-by: Eliza Weisman --- Cargo.lock | 43 +++++++ proxy/Cargo.toml | 1 + proxy/src/lib.rs | 2 + proxy/src/telemetry/metrics/counter.rs | 6 + proxy/src/telemetry/metrics/mod.rs | 15 +++ proxy/src/telemetry/metrics/process.rs | 151 +++++++++++++++++++++++++ 6 files changed, 218 insertions(+) create mode 100644 proxy/src/telemetry/metrics/process.rs diff --git a/Cargo.lock b/Cargo.lock index 6b925bcb1..9327094d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -141,6 +141,7 @@ dependencies = [ "libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", "net2 0.2.32 (registry+https://github.com/rust-lang/crates.io-index)", + "procinfo 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "prost 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "prost-types 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "quickcheck 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -648,6 +649,11 @@ name = "nodrop" version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "nom" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "num" version = "0.1.42" @@ -717,6 +723,17 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "procinfo" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.1 
(registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)", + "nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "prost" version = "0.4.0" @@ -885,6 +902,14 @@ name = "rustc-demangle" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rustc_version" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rustls" version = "0.12.0" @@ -917,6 +942,19 @@ dependencies = [ "untrusted 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "slab" version = "0.4.0" @@ -1562,6 +1600,7 @@ dependencies = [ "checksum multimap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2eb04b9f127583ed176e163fb9ec6f3e793b87e21deedd5734a69386a18a0151" "checksum net2 0.2.32 (registry+https://github.com/rust-lang/crates.io-index)" = "9044faf1413a1057267be51b5afba8eb1090bd2231c693664aa1db716fe1eae0" "checksum nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "9a2228dca57108069a5262f2ed8bd2e82496d2e074a06d1ccc7ce1687b6ae0a2" +"checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff" "checksum num 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = 
"4703ad64153382334aa8db57c637364c322d3372e097840c72000dabdcf6156e" "checksum num-integer 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "f8d26da319fb45674985c78f1d1caf99aa4941f785d384a2ae36d0740bc3e2fe" "checksum num-iter 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)" = "4b226df12c5a59b63569dd57fafb926d91b385dfce33d8074a412411b689d593" @@ -1571,6 +1610,7 @@ dependencies = [ "checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831" "checksum petgraph 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)" = "7a7e5234c228fbfa874c86a77f685886127f82e0aef602ad1d48333fcac6ad61" "checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" +"checksum procinfo 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6ab1427f3d2635891f842892dda177883dca0639e05fe66796a62c9d2f23b49c" "checksum prost 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b9f36c478cd43382388dfc3a3679af175c03d19ed8039e79a3e4447e944cd3f3" "checksum prost-build 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b6325275b85605f58f576456a47af44417edf5956a6f670bb59fbe12aff69597" "checksum prost-derive 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9787d1977ea72e8066d58e46ae66100324a2815e677897fe78dfe54958f48252" @@ -1590,10 +1630,13 @@ dependencies = [ "checksum resolv-conf 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8e1b086bb6a2659d6ba66e4aa21bde8a53ec03587cd5c80b83bdc3a330f35cab" "checksum ring 0.13.0-alpha5 (registry+https://github.com/rust-lang/crates.io-index)" = "3845516753f91b4511f9b17c917ea6fa4bc5a7853a9947b0f66731aff51cdef5" "checksum rustc-demangle 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "aee45432acc62f7b9a108cc054142dac51f979e69e71ddce7d6fc7adf29e817e" 
+"checksum rustc_version 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a54aa04a10c68c1c4eacb4337fd883b435997ede17a9385784b990777686b09a" "checksum rustls 0.12.0 (git+https://github.com/ctz/rustls)" = "" "checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" "checksum sct 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4540aed8d71a5de961a8902cf356e28122bd62695eb5be1c214f84d8704097c" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d" "checksum smallvec 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "03dab98ab5ded3a8b43b2c80751194608d0b2aa0f1d46cf95d1c35e192844aa7" "checksum socket2 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ff606e0486e88f5fc6cfeb3966e434fb409abbc7a3ab495238f70a1ca97f789d" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 16138c270..37d7f0633 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -63,6 +63,7 @@ untrusted = "0.6.1" libc = "0.2" # We can use the `crates.io` version of `inotify` once 0.5.2 has been released. 
inotify = { git = "https://github.com/inotify-rs/inotify" } +procinfo = "0.4.2" [dev-dependencies] net2 = "0.2" diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index c8b649d38..bb834b36e 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -23,6 +23,8 @@ extern crate libc; extern crate log; #[cfg_attr(test, macro_use)] extern crate indexmap; +#[cfg(target_os = "linux")] +extern crate procinfo; extern crate prost; extern crate prost_types; #[cfg(test)] diff --git a/proxy/src/telemetry/metrics/counter.rs b/proxy/src/telemetry/metrics/counter.rs index 1b473ff8e..00a924a85 100644 --- a/proxy/src/telemetry/metrics/counter.rs +++ b/proxy/src/telemetry/metrics/counter.rs @@ -43,6 +43,12 @@ impl Into for Counter { } } +impl From for Counter { + fn from(value: u64) -> Self { + Counter(Wrapping(value)) + } +} + impl ops::Add for Counter { type Output = Self; fn add(self, Counter(rhs): Self) -> Self::Output { diff --git a/proxy/src/telemetry/metrics/mod.rs b/proxy/src/telemetry/metrics/mod.rs index f66fabb5b..9bb282590 100644 --- a/proxy/src/telemetry/metrics/mod.rs +++ b/proxy/src/telemetry/metrics/mod.rs @@ -56,6 +56,7 @@ mod histogram; mod http; mod labels; mod latency; +mod process; mod record; mod serve; mod transport; @@ -109,6 +110,8 @@ struct Root { transports: transport::OpenScopes, transport_closes: transport::CloseScopes, + process_metrics: Option, + start_time: Gauge, } @@ -182,8 +185,13 @@ impl Root { .expect("process start time") .as_secs(); + let process_metrics = process::Sensor::new() + .map_err(|e| info!("{}", e)) + .ok(); + Self { start_time: t0.into(), + process_metrics, .. 
Root::default() } } @@ -227,6 +235,13 @@ impl fmt::Display for Root { self.transports.fmt(f)?; self.transport_closes.fmt(f)?; + if let Some(ref process_metrics) = self.process_metrics { + match process_metrics.metrics() { + Ok(process) => process.fmt(f)?, + Err(e) => warn!("error collecting process metrics: {:?}", e), + } + }; + Self::process_start_time_seconds.fmt_help(f)?; Self::process_start_time_seconds.fmt_metric(f, self.start_time)?; diff --git a/proxy/src/telemetry/metrics/process.rs b/proxy/src/telemetry/metrics/process.rs new file mode 100644 index 000000000..30e3a74b8 --- /dev/null +++ b/proxy/src/telemetry/metrics/process.rs @@ -0,0 +1,151 @@ +use std::fmt; +use super::{Counter, Gauge, Metric}; + +pub use self::imp::Sensor; + +#[derive(Copy, Clone)] +pub struct ProcessMetrics { + cpu_seconds_total: Counter, + open_fds: Gauge, + max_fds: Option, + virtual_memory_bytes: Gauge, + resident_memory_bytes: Gauge, +} + +impl ProcessMetrics { + metrics! { + process_cpu_seconds_total: Counter { + "Total user and system CPU time spent in seconds." + }, + process_open_fds: Gauge { "Number of open file descriptors." }, + process_max_fds: Gauge { "Maximum number of open file descriptors." }, + process_virtual_memory_bytes: Gauge { + "Virtual memory size in bytes." + }, + process_resident_memory_bytes: Gauge { + "Resident memory size in bytes." 
+ } + } +} + +impl fmt::Display for ProcessMetrics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Self::process_cpu_seconds_total.fmt_help(f)?; + Self::process_cpu_seconds_total.fmt_metric( + f, + self.cpu_seconds_total + )?; + + Self::process_open_fds.fmt_help(f)?; + Self::process_open_fds.fmt_metric(f, self.open_fds)?; + + if let Some(ref max_fds) = self.max_fds { + Self::process_max_fds.fmt_help(f)?; + Self::process_max_fds.fmt_metric(f, *max_fds)?; + } + + Self::process_virtual_memory_bytes.fmt_help(f)?; + Self::process_virtual_memory_bytes.fmt_metric( + f, + self.virtual_memory_bytes + )?; + + Self::process_resident_memory_bytes.fmt_help(f)?; + Self::process_resident_memory_bytes.fmt_metric( + f, + self.resident_memory_bytes + ) + } +} + +#[cfg(target_os = "linux")] +mod imp { + use super::*; + use super::super::{Counter, Gauge}; + + use std::{io, fs}; + + use procinfo::pid; + use libc::{self, pid_t}; + + #[derive(Debug)] + pub struct Sensor { + page_size: usize, + } + + impl Sensor { + pub fn new() -> io::Result { + let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } { + e if e < 0 => { + let error = io::Error::last_os_error(); + error!("error getting page size: {:?}", error); + return Err(error); + }, + page_size => page_size as usize, + }; + Ok(Sensor { + page_size, + }) + } + + pub fn metrics(&self) -> io::Result { + // XXX potentially blocking call + let stat = pid::stat_self()?; + + let cpu_seconds_total = Counter::from((stat.utime + stat.stime) as u64); + let virtual_memory_bytes = Gauge::from(stat.vsize as u64); + let resident_memory_bytes = Gauge::from((stat.rss * self.page_size) as u64); + + let metrics = ProcessMetrics { + cpu_seconds_total, + virtual_memory_bytes, + resident_memory_bytes, + open_fds: open_fds(stat.pid)?, + max_fds: max_fds()?, + }; + + Ok(metrics) + } + } + + + fn open_fds(pid: pid_t) -> io::Result { + let mut open = 0; + for f in fs::read_dir(format!("/proc/{}/fd", pid))? 
{ + if !f?.file_type()?.is_dir() { + open += 1; + } + } + Ok(Gauge::from(open)) + } + + fn max_fds() -> io::Result> { + let limit = pid::limits_self()?.max_open_files; + let max_fds = limit.soft.or(limit.hard) + .map(|max| Gauge::from(max as u64)); + Ok(max_fds) + } +} + +#[cfg(not(target_os = "linux"))] +mod imp { + use super::*; + use std::io; + + #[derive(Debug)] + pub struct Sensor {} + + impl Sensor { + pub fn new() -> io::Result { + Err(io::Error::new( + io::ErrorKind::Other, + "procinfo not supported on this operating system" + )) + } + + pub fn metrics(&self) -> io::Result { + unreachable!("process::Sensor::metrics() on unsupported OS!") + } + } + +}