Fix double comma in outbound metrics (#601)

Fixes #600 

The proxy metrics endpoint has a bug where metrics recorded in the outbound direction can contain two commas in a row when no outbound label is present. This occurs because the code for formatting the outbound direction label mistakenly assumed that there would always be a destination pod owner label as well, but the proxy isn't currently aware of the destination's pod owner (waiting for #429). 

I've fixed this issue by moving the place where the comma is output from the `fmt::Display` impl for `RequestLabels` to the `fmt::Display` impl for `OutboudnLabels`. This way, the comma between the `direction` and `dst_*` labels is only output when the `dst_*` label is present. 

This bug made it to master since all of the proxy end-to-end tests for metrics only test the inbound router. I've rectified this issue by adding tests on the outbound router as well (which would fail against the current master due to the double comma bug). I've also added a test that asserts there are no double commas in exported metrics, to protect against regressions to this bug.
This commit is contained in:
Eliza Weisman 2018-03-22 14:17:10 -07:00 committed by GitHub
parent c03508ba8c
commit 3f10c80256
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 174 additions and 6 deletions

View File

@ -413,7 +413,10 @@ impl fmt::Display for RequestLabels {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "authority=\"{}\",", self.authority)?;
if let Some(ref outbound) = self.outbound_labels {
write!(f, "direction=\"outbound\",{}", outbound)?;
write!(f, "direction=\"outbound\"{comma}{dst}",
comma = if !outbound.is_empty() { "," } else { "" },
dst = outbound
)?;
} else {
write!(f, "direction=\"inbound\"")?;
}
@ -425,16 +428,25 @@ impl fmt::Display for RequestLabels {
}
// ===== impl OutboundLabels =====
impl OutboundLabels {
fn is_empty(&self) -> bool {
self.namespace.is_none() && self.dst.is_none()
}
}
impl fmt::Display for OutboundLabels {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
OutboundLabels { namespace: Some(ref ns), dst: Some(ref dst) } =>
write!(f, "dst_namespace=\"{}\",dst_{},", ns, dst),
write!(f, "dst_namespace=\"{}\",dst_{}", ns, dst),
OutboundLabels { namespace: None, dst: Some(ref dst), } =>
write!(f, "dst_{},", dst),
write!(f, "dst_{}", dst),
OutboundLabels { namespace: Some(ref ns), dst: None, } =>
write!(f, "dst_namespace=\"{}\",", ns),
write!(f, "dst_namespace=\"{}\"", ns),
OutboundLabels { namespace: None, dst: None, } =>
write!(f, ""),
}

View File

@ -267,7 +267,7 @@ macro_rules! assert_contains {
}
#[test]
fn metrics_endpoint_request_count() {
fn metrics_endpoint_inbound_request_count() {
let _ = env_logger::try_init();
info!("running test server");
@ -296,7 +296,38 @@ fn metrics_endpoint_request_count() {
}
#[test]
fn metrics_endpoint_response_latency() {
fn metrics_endpoint_outbound_request_count() {
let _ = env_logger::try_init();
info!("running test server");
let srv = server::new().route("/hey", "hello").run();
let ctrl = controller::new()
.destination("tele.test.svc.cluster.local", srv.addr)
.run();
let proxy = proxy::new()
.controller(ctrl)
.outbound(srv)
.metrics_flush_interval(Duration::from_millis(500))
.run();
let client = client::new(proxy.outbound, "tele.test.svc.cluster.local");
let metrics = client::http1(proxy.metrics, "localhost");
// prior to seeing any requests, request count should be empty.
assert!(!metrics.get("/metrics")
.contains("request_total{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\"}"));
info!("client.get(/hey)");
assert_eq!(client.get("/hey"), "hello");
let scrape = metrics.get("/metrics");
// after seeing a request, the request count should be 1.
assert_contains!(scrape, "request_total{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\"} 1");
}
#[test]
fn metrics_endpoint_inbound_response_latency() {
let _ = env_logger::try_init();
info!("running test server");
@ -376,3 +407,128 @@ fn metrics_endpoint_response_latency() {
assert_contains!(scrape,
"response_latency_ms_count{authority=\"tele.test.svc.cluster.local\",direction=\"inbound\",status_code=\"200\"} 4");
}
#[test]
fn metrics_endpoint_outbound_response_latency() {
let _ = env_logger::try_init();
info!("running test server");
let srv = server::new()
.route_with_latency("/hey", "hello", Duration::from_millis(500))
.route_with_latency("/hi", "good morning", Duration::from_millis(40))
.run();
let ctrl = controller::new()
.destination("tele.test.svc.cluster.local", srv.addr)
.run();
let proxy = proxy::new()
.controller(ctrl)
.outbound(srv)
.metrics_flush_interval(Duration::from_millis(500))
.run();
let client = client::new(proxy.outbound, "tele.test.svc.cluster.local");
let metrics = client::http1(proxy.metrics, "localhost");
info!("client.get(/hey)");
assert_eq!(client.get("/hey"), "hello");
let scrape = metrics.get("/metrics");
// assert the >=1000ms bucket is incremented by our request with 500ms
// extra latency.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"1000\"} 1");
// the histogram's count should be 1.
assert_contains!(scrape,
"response_latency_ms_count{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\"} 1");
// TODO: we're not going to make any assertions about the
// response_latency_ms_sum stat, since its granularity depends on the actual
// observed latencies, which may vary a bit. we could make more reliable
// assertions about that stat if we were using a mock timer, though, as the
// observed latency values would be predictable.
info!("client.get(/hi)");
assert_eq!(client.get("/hi"), "good morning");
let scrape = metrics.get("/metrics");
// request with 40ms extra latency should fall into the 50ms bucket.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"50\"} 1");
// 1000ms bucket should be incremented as well, since it counts *all*
// bservations less than or equal to 1000ms, even if they also increment
// other buckets.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"1000\"} 2");
// the histogram's total count should be 2.
assert_contains!(scrape,
"response_latency_ms_count{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\"} 2");
info!("client.get(/hi)");
assert_eq!(client.get("/hi"), "good morning");
let scrape = metrics.get("/metrics");
// request with 40ms extra latency should fall into the 50ms bucket.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"50\"} 2");
// 1000ms bucket should be incremented as well.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"1000\"} 3");
// the histogram's total count should be 3.
assert_contains!(scrape,
"response_latency_ms_count{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\"} 3");
info!("client.get(/hey)");
assert_eq!(client.get("/hey"), "hello");
let scrape = metrics.get("/metrics");
// 50ms bucket should be un-changed by the request with 500ms latency.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"50\"} 2");
// 1000ms bucket should be incremented.
assert_contains!(scrape,
"response_latency_ms_bucket{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\",le=\"1000\"} 4");
// the histogram's total count should be 4.
assert_contains!(scrape,
"response_latency_ms_count{authority=\"tele.test.svc.cluster.local\",direction=\"outbound\",status_code=\"200\"} 4");
}
#[test]
fn metrics_have_no_double_commas() {
// Test for regressions to runconduit/conduit#600.
let _ = env_logger::try_init();
info!("running test server");
let inbound_srv = server::new().route("/hey", "hello").run();
let outbound_srv = server::new().route("/hey", "hello").run();
let ctrl = controller::new()
.destination("tele.test.svc.cluster.local", outbound_srv.addr)
.run();
let proxy = proxy::new()
.controller(ctrl)
.inbound(inbound_srv)
.outbound(outbound_srv)
.metrics_flush_interval(Duration::from_millis(500))
.run();
let client = client::new(proxy.inbound, "tele.test.svc.cluster.local");
let metrics = client::http1(proxy.metrics, "localhost");
let scrape = metrics.get("/metrics");
assert!(!scrape.contains(",,"));
info!("inbound.get(/hey)");
assert_eq!(client.get("/hey"), "hello");
let scrape = metrics.get("/metrics");
assert!(!scrape.contains(",,"), "inbound metrics had double comma");
let client = client::new(proxy.outbound, "tele.test.svc.cluster.local");
info!("outbound.get(/hey)");
assert_eq!(client.get("/hey"), "hello");
let scrape = metrics.get("/metrics");
assert!(!scrape.contains(",,"), "outbound metrics had double comma");
}