xds: include node ID in RPC failure status messages from the XdsClient (#9099)

This commit is contained in:
Sergii Tkachenko 2022-04-21 09:52:31 -07:00 committed by GitHub
parent 3a303af02f
commit a5829107c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 64 additions and 42 deletions

View File

@ -2537,8 +2537,16 @@ final class ClientXdsClient extends XdsClient implements XdsResponseHandler, Res
respTimer.cancel(); respTimer.cancel();
respTimer = null; respTimer = null;
} }
// Include node ID in xds failures to allow cross-referencing with control plane logs
// when debugging.
String description = error.getDescription() == null ? "" : error.getDescription() + " ";
Status errorAugmented = Status.fromCode(error.getCode())
.withDescription(description + "nodeID: " + bootstrapInfo.node().getId())
.withCause(error.getCause());
for (ResourceWatcher watcher : watchers) { for (ResourceWatcher watcher : watchers) {
watcher.onError(error); watcher.onError(errorAugmented);
} }
} }

View File

@ -398,6 +398,13 @@ abstract class XdsClient {
/** /**
* Called when the resource discovery RPC encounters some transient error. * Called when the resource discovery RPC encounters some transient error.
*
* <p>Note that we expect that the implementer to:
* - Comply with the guarantee to not generate certain statuses by the library:
* https://grpc.github.io/grpc/core/md_doc_statuscodes.html. If the code needs to be
* propagated to the channel, override it with {@link Status.Code#UNAVAILABLE}.
* - Keep {@link Status} description in one form or another, as it contains valuable debugging
* information.
*/ */
void onError(Status error); void onError(Status error);

View File

@ -126,7 +126,8 @@ public abstract class ClientXdsClientTestBase {
private static final String VERSION_1 = "42"; private static final String VERSION_1 = "42";
private static final String VERSION_2 = "43"; private static final String VERSION_2 = "43";
private static final String VERSION_3 = "44"; private static final String VERSION_3 = "44";
private static final Node NODE = Node.newBuilder().build(); private static final String NODE_ID = "cool-node-id";
private static final Node NODE = Node.newBuilder().setId(NODE_ID).build();
private static final Any FAILING_ANY = MessageFactory.FAILING_ANY; private static final Any FAILING_ANY = MessageFactory.FAILING_ANY;
private static final ChannelCredentials CHANNEL_CREDENTIALS = InsecureChannelCredentials.create(); private static final ChannelCredentials CHANNEL_CREDENTIALS = InsecureChannelCredentials.create();
private final ServerInfo lrsServerInfo = private final ServerInfo lrsServerInfo =
@ -314,7 +315,7 @@ public abstract class ClientXdsClientTestBase {
Bootstrapper.BootstrapInfo.builder() Bootstrapper.BootstrapInfo.builder()
.servers(Arrays.asList( .servers(Arrays.asList(
Bootstrapper.ServerInfo.create(SERVER_URI, CHANNEL_CREDENTIALS, useProtocolV3()))) Bootstrapper.ServerInfo.create(SERVER_URI, CHANNEL_CREDENTIALS, useProtocolV3())))
.node(EnvoyProtoData.Node.newBuilder().build()) .node(NODE)
.authorities(ImmutableMap.of( .authorities(ImmutableMap.of(
"authority.xds.com", "authority.xds.com",
AuthorityInfo.create( AuthorityInfo.create(
@ -467,6 +468,15 @@ public abstract class ClientXdsClientTestBase {
return metadata; return metadata;
} }
private void verifyStatusWithNodeId(Status status, Code expectedCode, String expectedMsg) {
assertThat(status.getCode()).isEqualTo(expectedCode);
assertThat(status.getCause()).isNull();
// Watcher.onError propagates status description to the channel, and we want to
// augment the description with the node id.
String description = (expectedMsg.isEmpty() ? "" : expectedMsg + " ") + "nodeID: " + NODE_ID;
assertThat(status.getDescription()).isEqualTo(description);
}
/** /**
* Helper method to validate {@link XdsClient.EdsUpdate} created for the test CDS resource * Helper method to validate {@link XdsClient.EdsUpdate} created for the test CDS resource
* {@link ClientXdsClientTestBase#testClusterLoadAssignment}. * {@link ClientXdsClientTestBase#testClusterLoadAssignment}.
@ -1840,11 +1850,8 @@ public abstract class ClientXdsClientTestBase {
+ "io.grpc.xds.ClientXdsClient$ResourceInvalidException: " + "io.grpc.xds.ClientXdsClient$ResourceInvalidException: "
+ "ca_certificate_provider_instance is required in upstream-tls-context"; + "ca_certificate_provider_instance is required in upstream-tls-context";
call.verifyRequestNack(CDS, CDS_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg)); call.verifyRequestNack(CDS, CDS_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg));
ArgumentCaptor<Status> captor = ArgumentCaptor.forClass(Status.class); verify(cdsResourceWatcher).onError(errorCaptor.capture());
verify(cdsResourceWatcher).onError(captor.capture()); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
Status errorStatus = captor.getValue();
assertThat(errorStatus.getCode()).isEqualTo(Status.UNAVAILABLE.getCode());
assertThat(errorStatus.getDescription()).isEqualTo(errorMsg);
} }
/** /**
@ -1866,11 +1873,8 @@ public abstract class ClientXdsClientTestBase {
String errorMsg = "CDS response Cluster 'cluster.googleapis.com' validation error: " String errorMsg = "CDS response Cluster 'cluster.googleapis.com' validation error: "
+ "transport-socket with name envoy.transport_sockets.bad not supported."; + "transport-socket with name envoy.transport_sockets.bad not supported.";
call.verifyRequestNack(CDS, CDS_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg)); call.verifyRequestNack(CDS, CDS_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg));
ArgumentCaptor<Status> captor = ArgumentCaptor.forClass(Status.class); verify(cdsResourceWatcher).onError(errorCaptor.capture());
verify(cdsResourceWatcher).onError(captor.capture()); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
Status errorStatus = captor.getValue();
assertThat(errorStatus.getCode()).isEqualTo(Status.UNAVAILABLE.getCode());
assertThat(errorStatus.getDescription()).isEqualTo(errorMsg);
} }
@Test @Test
@ -2461,13 +2465,13 @@ public abstract class ClientXdsClientTestBase {
// Management server closes the RPC stream with an error. // Management server closes the RPC stream with an error.
call.sendError(Status.UNKNOWN.asException()); call.sendError(Status.UNKNOWN.asException());
verify(ldsResourceWatcher).onError(errorCaptor.capture()); verify(ldsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNKNOWN); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNKNOWN, "");
verify(rdsResourceWatcher).onError(errorCaptor.capture()); verify(rdsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNKNOWN); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNKNOWN, "");
verify(cdsResourceWatcher).onError(errorCaptor.capture()); verify(cdsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNKNOWN); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNKNOWN, "");
verify(edsResourceWatcher).onError(errorCaptor.capture()); verify(edsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNKNOWN); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNKNOWN, "");
// Retry after backoff. // Retry after backoff.
inOrder.verify(backoffPolicyProvider).get(); inOrder.verify(backoffPolicyProvider).get();
@ -2483,15 +2487,16 @@ public abstract class ClientXdsClientTestBase {
call.verifyRequest(EDS, EDS_RESOURCE, "", "", NODE); call.verifyRequest(EDS, EDS_RESOURCE, "", "", NODE);
// Management server becomes unreachable. // Management server becomes unreachable.
call.sendError(Status.UNAVAILABLE.asException()); String errorMsg = "my fault";
call.sendError(Status.UNAVAILABLE.withDescription(errorMsg).asException());
verify(ldsResourceWatcher, times(2)).onError(errorCaptor.capture()); verify(ldsResourceWatcher, times(2)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
verify(rdsResourceWatcher, times(2)).onError(errorCaptor.capture()); verify(rdsResourceWatcher, times(2)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
verify(cdsResourceWatcher, times(2)).onError(errorCaptor.capture()); verify(cdsResourceWatcher, times(2)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
verify(edsResourceWatcher, times(2)).onError(errorCaptor.capture()); verify(edsResourceWatcher, times(2)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
// Retry after backoff. // Retry after backoff.
inOrder.verify(backoffPolicy1).nextBackoffNanos(); inOrder.verify(backoffPolicy1).nextBackoffNanos();
@ -2518,13 +2523,13 @@ public abstract class ClientXdsClientTestBase {
call.sendError(Status.DEADLINE_EXCEEDED.asException()); call.sendError(Status.DEADLINE_EXCEEDED.asException());
verify(ldsResourceWatcher, times(3)).onError(errorCaptor.capture()); verify(ldsResourceWatcher, times(3)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.DEADLINE_EXCEEDED); verifyStatusWithNodeId(errorCaptor.getValue(), Code.DEADLINE_EXCEEDED, "");
verify(rdsResourceWatcher, times(3)).onError(errorCaptor.capture()); verify(rdsResourceWatcher, times(3)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.DEADLINE_EXCEEDED); verifyStatusWithNodeId(errorCaptor.getValue(), Code.DEADLINE_EXCEEDED, "");
verify(cdsResourceWatcher, times(3)).onError(errorCaptor.capture()); verify(cdsResourceWatcher, times(3)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.DEADLINE_EXCEEDED); verifyStatusWithNodeId(errorCaptor.getValue(), Code.DEADLINE_EXCEEDED, "");
verify(edsResourceWatcher, times(3)).onError(errorCaptor.capture()); verify(edsResourceWatcher, times(3)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.DEADLINE_EXCEEDED); verifyStatusWithNodeId(errorCaptor.getValue(), Code.DEADLINE_EXCEEDED, "");
// Reset backoff sequence and retry after backoff. // Reset backoff sequence and retry after backoff.
inOrder.verify(backoffPolicyProvider).get(); inOrder.verify(backoffPolicyProvider).get();
@ -2542,13 +2547,13 @@ public abstract class ClientXdsClientTestBase {
// Management server becomes unreachable again. // Management server becomes unreachable again.
call.sendError(Status.UNAVAILABLE.asException()); call.sendError(Status.UNAVAILABLE.asException());
verify(ldsResourceWatcher, times(4)).onError(errorCaptor.capture()); verify(ldsResourceWatcher, times(4)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(rdsResourceWatcher, times(4)).onError(errorCaptor.capture()); verify(rdsResourceWatcher, times(4)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(cdsResourceWatcher, times(4)).onError(errorCaptor.capture()); verify(cdsResourceWatcher, times(4)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(edsResourceWatcher, times(4)).onError(errorCaptor.capture()); verify(edsResourceWatcher, times(4)).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
// Retry after backoff. // Retry after backoff.
inOrder.verify(backoffPolicy2).nextBackoffNanos(); inOrder.verify(backoffPolicy2).nextBackoffNanos();
@ -2572,9 +2577,9 @@ public abstract class ClientXdsClientTestBase {
DiscoveryRpcCall call = resourceDiscoveryCalls.poll(); DiscoveryRpcCall call = resourceDiscoveryCalls.poll();
call.sendError(Status.UNAVAILABLE.asException()); call.sendError(Status.UNAVAILABLE.asException());
verify(ldsResourceWatcher).onError(errorCaptor.capture()); verify(ldsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(rdsResourceWatcher).onError(errorCaptor.capture()); verify(rdsResourceWatcher).onError(errorCaptor.capture());
assertThat(errorCaptor.getValue().getCode()).isEqualTo(Code.UNAVAILABLE); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
ScheduledTask retryTask = ScheduledTask retryTask =
Iterables.getOnlyElement(fakeClock.getPendingTasks(RPC_RETRY_TASK_FILTER)); Iterables.getOnlyElement(fakeClock.getPendingTasks(RPC_RETRY_TASK_FILTER));
assertThat(retryTask.getDelay(TimeUnit.NANOSECONDS)).isEqualTo(10L); assertThat(retryTask.getDelay(TimeUnit.NANOSECONDS)).isEqualTo(10L);
@ -2621,6 +2626,14 @@ public abstract class ClientXdsClientTestBase {
call.sendError(Status.UNAVAILABLE.asException()); call.sendError(Status.UNAVAILABLE.asException());
assertThat(cdsResourceTimeout.isCancelled()).isTrue(); assertThat(cdsResourceTimeout.isCancelled()).isTrue();
assertThat(edsResourceTimeout.isCancelled()).isTrue(); assertThat(edsResourceTimeout.isCancelled()).isTrue();
verify(ldsResourceWatcher).onError(errorCaptor.capture());
verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(rdsResourceWatcher).onError(errorCaptor.capture());
verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(cdsResourceWatcher).onError(errorCaptor.capture());
verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
verify(edsResourceWatcher).onError(errorCaptor.capture());
verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, "");
fakeClock.forwardNanos(10L); fakeClock.forwardNanos(10L);
assertThat(fakeClock.getPendingTasks(LDS_RESOURCE_FETCH_TIMEOUT_TASK_FILTER)).hasSize(0); assertThat(fakeClock.getPendingTasks(LDS_RESOURCE_FETCH_TIMEOUT_TASK_FILTER)).hasSize(0);
@ -2740,11 +2753,8 @@ public abstract class ClientXdsClientTestBase {
+ "0.0.0.0:7000\' validation error: " + "0.0.0.0:7000\' validation error: "
+ "common-tls-context is required in downstream-tls-context"; + "common-tls-context is required in downstream-tls-context";
call.verifyRequestNack(LDS, LISTENER_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg)); call.verifyRequestNack(LDS, LISTENER_RESOURCE, "", "0000", NODE, ImmutableList.of(errorMsg));
ArgumentCaptor<Status> captor = ArgumentCaptor.forClass(Status.class); verify(ldsResourceWatcher).onError(errorCaptor.capture());
verify(ldsResourceWatcher).onError(captor.capture()); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
Status errorStatus = captor.getValue();
assertThat(errorStatus.getCode()).isEqualTo(Status.UNAVAILABLE.getCode());
assertThat(errorStatus.getDescription()).isEqualTo(errorMsg);
} }
@Test @Test
@ -2770,11 +2780,8 @@ public abstract class ClientXdsClientTestBase {
+ "transport-socket with name envoy.transport_sockets.bad1 not supported."; + "transport-socket with name envoy.transport_sockets.bad1 not supported.";
call.verifyRequestNack(LDS, LISTENER_RESOURCE, "", "0000", NODE, ImmutableList.of( call.verifyRequestNack(LDS, LISTENER_RESOURCE, "", "0000", NODE, ImmutableList.of(
errorMsg)); errorMsg));
ArgumentCaptor<Status> captor = ArgumentCaptor.forClass(Status.class); verify(ldsResourceWatcher).onError(errorCaptor.capture());
verify(ldsResourceWatcher).onError(captor.capture()); verifyStatusWithNodeId(errorCaptor.getValue(), Code.UNAVAILABLE, errorMsg);
Status errorStatus = captor.getValue();
assertThat(errorStatus.getCode()).isEqualTo(Status.UNAVAILABLE.getCode());
assertThat(errorStatus.getDescription()).isEqualTo(errorMsg);
} }
private DiscoveryRpcCall startResourceWatcher( private DiscoveryRpcCall startResourceWatcher(