Add wait conditions to combat flakiness (#11368)

We intermittently see flaky policy integration test failures like:

```
 failures:
    either

thread 'either' panicked at 'assertion failed: `(left == right)`
  left: `7`,
 right: `0`: blessed uninjected curl must succeed', policy-test/tests/e2e_server_authorization.rs:293:9
```

This failure means that the curl process returned exit code 7 instead of the expected exit code 0. Per the curl documentation, exit code 7 indicates that curl failed to establish a connection: https://everything.curl.dev/usingcurl/returns

It's unclear why this connection occasionally fails in CI and I have not been able to reproduce this failure locally.

However, looking at the logic of the integration test, we can see that it creates the `web` Service and the `web` Pod and waits for that pod to become ready before unblocking curl. This means that, theoretically, there could be a race condition between the test and the Kubernetes endpoints controller: as soon as the web pod becomes ready, the endpoints controller updates the Endpoints resource for the `web` Service and, at the same time, our test unblocks the curl command. If the test wins this race, curl may run before the Endpoints resource has been updated.

We add an additional wait condition to the test to wait until the endpoints resource has an endpoint before unblocking curl.

Since I could not reproduce the test failure locally, it is impossible to say if this is actually the cause of the flakiness or if this change fixes it.

Signed-off-by: Alex Leong <alex@buoyant.io>
This commit is contained in:
Alex Leong 2023-09-15 17:05:27 -07:00 committed by GitHub
parent 610e2b8309
commit 8579c10d87
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 8 deletions

View File

@ -12,8 +12,8 @@ pub use k8s_openapi::{
self,
coordination::v1::Lease,
core::v1::{
Container, ContainerPort, HTTPGetAction, Namespace, Node, NodeSpec, Pod, PodSpec,
PodStatus, Probe, Service, ServiceAccount, ServicePort, ServiceSpec,
Container, ContainerPort, Endpoints, HTTPGetAction, Namespace, Node, NodeSpec, Pod,
PodSpec, PodStatus, Probe, Service, ServiceAccount, ServicePort, ServiceSpec,
},
},
apimachinery::{

View File

@ -168,6 +168,14 @@ pub async fn await_route_status(
.inner
}
/// Condition that holds once an `Endpoints` resource lists at least one ready
/// address.
///
/// Returns `false` when `obj` is `None`, when the resource has no subsets, or
/// when every subset has an absent or empty `addresses` list.
///
/// Subsets that only carry `not_ready_addresses` deliberately do not count:
/// the Kubernetes `Endpoints` API keeps addresses of pods that are not yet
/// ready in a separate list, so a non-empty `subsets` alone does not mean the
/// Service has anything to route to.
pub fn endpoints_ready(obj: Option<&k8s::Endpoints>) -> bool {
    obj.map_or(false, |ep| {
        ep.subsets
            .iter()
            .flatten()
            // `addresses` is optional; treat a missing list like an empty one.
            .any(|subset| subset.addresses.iter().flatten().next().is_some())
    })
}
#[tracing::instrument(skip_all, fields(%pod, %container))]
pub async fn logs(client: &kube::Client, ns: &str, pod: &str, container: &str) {
let params = kube::api::LogParams {

View File

@ -3,7 +3,10 @@ use linkerd_policy_controller_k8s_api::{
self as k8s,
policy::{LocalTargetRef, NamespacedTargetRef},
};
use linkerd_policy_test::{create, create_ready_pod, curl, web, with_temp_ns, LinkerdInject};
use linkerd_policy_test::{
await_condition, create, create_ready_pod, curl, endpoints_ready, web, with_temp_ns,
LinkerdInject,
};
#[tokio::test(flavor = "current_thread")]
async fn meshtls() {
@ -294,9 +297,12 @@ async fn network() {
create_ready_pod(&client, web::pod(&ns))
);
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
curl.delete_lock().await;
tracing::info!("unblocked curl");
// The blessed pod should be able to connect to the web pod.
let status = blessed.exit_code().await;
@ -372,6 +378,8 @@ async fn both() {
create_ready_pod(&client, web::pod(&ns))
);
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
curl.delete_lock().await;
@ -474,10 +482,12 @@ async fn either() {
create_ready_pod(&client, web::pod(&ns)),
);
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
curl.delete_lock().await;
tracing::info!("unblocking curl");
tracing::info!("unblocked curl");
let (blessed_injected_status, blessed_uninjected_status) =
tokio::join!(blessed_injected.exit_code(), blessed_uninjected.exit_code());

View File

@ -1,7 +1,10 @@
use linkerd_policy_controller_k8s_api::{
self as k8s, policy::server_authorization::Client as ClientAuthz, ResourceExt,
};
use linkerd_policy_test::{create, create_ready_pod, curl, web, with_temp_ns, LinkerdInject};
use linkerd_policy_test::{
await_condition, create, create_ready_pod, curl, endpoints_ready, web, with_temp_ns,
LinkerdInject,
};
#[tokio::test(flavor = "current_thread")]
async fn meshtls() {
@ -88,8 +91,12 @@ async fn network() {
create_ready_pod(&client, web::pod(&ns))
);
tracing::info!("Unblocking curl");
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
curl.delete_lock().await;
tracing::info!("unblocked curl");
// The blessed pod should be able to connect to the web pod.
let status = blessed.exit_code().await;
@ -170,10 +177,12 @@ async fn both() {
create_ready_pod(&client, web::pod(&ns))
);
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
tracing::info!("Unblocking curl");
curl.delete_lock().await;
tracing::info!("unblocked curl");
let (blessed_injected_status, blessed_uninjected_status) =
tokio::join!(blessed_injected.exit_code(), blessed_uninjected.exit_code());
@ -281,8 +290,12 @@ async fn either() {
create_ready_pod(&client, web::pod(&ns)),
);
tracing::info!("Unblocking curl");
await_condition(&client, &ns, "web", endpoints_ready).await;
// Once the web pod is ready, delete the `curl-lock` configmap to
// unblock curl from running.
curl.delete_lock().await;
tracing::info!("unblocked curl");
let (blessed_injected_status, blessed_uninjected_status) =
tokio::join!(blessed_injected.exit_code(), blessed_uninjected.exit_code());