grpclb: should not ignore subchannels with CONNECTING state in aggregating the overall LB state (#7959)

We should treat both IDLE and CONNECTING subchannels as "connection in progress" when aggregating for the overall load balancing state. Otherwise, RPCs could fail prematurely if one subchannel enters TF while all the others are still in CONNECTING.

23d279660c made each individual subchannel stay in TF until READY if it previously was in TF. So subchannels with CONNECTING state are those in first time connecting. We should give them time to connect.
This commit is contained in:
Chengyuan Zhang 2021-03-11 16:48:01 -08:00 committed by GitHub
parent afe883119d
commit 9c562c8a6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 6 deletions

View File

@ -776,7 +776,7 @@ final class GrpclbState {
case ROUND_ROBIN: case ROUND_ROBIN:
pickList = new ArrayList<>(backendList.size()); pickList = new ArrayList<>(backendList.size());
Status error = null; Status error = null;
boolean hasIdle = false; boolean hasPending = false;
for (BackendEntry entry : backendList) { for (BackendEntry entry : backendList) {
Subchannel subchannel = entry.subchannel; Subchannel subchannel = entry.subchannel;
Attributes attrs = subchannel.getAttributes(); Attributes attrs = subchannel.getAttributes();
@ -785,12 +785,12 @@ final class GrpclbState {
pickList.add(entry); pickList.add(entry);
} else if (stateInfo.getState() == TRANSIENT_FAILURE) { } else if (stateInfo.getState() == TRANSIENT_FAILURE) {
error = stateInfo.getStatus(); error = stateInfo.getStatus();
} else if (stateInfo.getState() == IDLE) { } else {
hasIdle = true; hasPending = true;
} }
} }
if (pickList.isEmpty()) { if (pickList.isEmpty()) {
if (error != null && !hasIdle) { if (error != null && !hasPending) {
pickList.add(new ErrorEntry(error)); pickList.add(new ErrorEntry(error));
state = TRANSIENT_FAILURE; state = TRANSIENT_FAILURE;
} else { } else {

View File

@ -1175,9 +1175,10 @@ public class GrpclbLoadBalancerTest {
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING)); deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class)); inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class));
// Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too. // Switch all subchannels to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error1"); Status error = Status.UNAVAILABLE.withDescription("error");
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error)); deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error));
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forTransientFailure(error));
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList) assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new ErrorEntry(error)); .containsExactly(new ErrorEntry(error));