grpclb: enter fallback mode immediately when balancer and all backend… (#4007)

grpclb: enter fallback mode immediately when balancer and all backend connections are lost

Changed according to updated spec.
This commit is contained in:
Kun Zhang 2018-02-07 14:42:00 -08:00 committed by GitHub
parent de54a4cb49
commit f44fc50310
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 82 additions and 99 deletions

View File

@ -101,7 +101,7 @@ final class GrpclbState {
private static final Attributes.Key<AtomicReference<ConnectivityStateInfo>> STATE_INFO = private static final Attributes.Key<AtomicReference<ConnectivityStateInfo>> STATE_INFO =
Attributes.Key.of("io.grpc.grpclb.GrpclbLoadBalancer.stateInfo"); Attributes.Key.of("io.grpc.grpclb.GrpclbLoadBalancer.stateInfo");
// Reset to null when timer expires or cancelled. // Scheduled only once. Never reset.
@Nullable @Nullable
private FallbackModeTask fallbackTimer; private FallbackModeTask fallbackTimer;
private List<EquivalentAddressGroup> fallbackBackendList = Collections.emptyList(); private List<EquivalentAddressGroup> fallbackBackendList = Collections.emptyList();
@ -146,7 +146,7 @@ final class GrpclbState {
subchannel.requestConnection(); subchannel.requestConnection();
} }
subchannel.getAttributes().get(STATE_INFO).set(newState); subchannel.getAttributes().get(STATE_INFO).set(newState);
maybeStartFallbackTimer(); maybeUseFallbackBackends();
maybeUpdatePicker(); maybeUpdatePicker();
} }
@ -171,7 +171,12 @@ final class GrpclbState {
startLbRpc(); startLbRpc();
} }
fallbackBackendList = newBackendServers; fallbackBackendList = newBackendServers;
maybeStartFallbackTimer(); // Start the fallback timer if it's never started
if (fallbackTimer == null) {
logger.log(Level.FINE, "[{0}] Starting fallback timer.", new Object[] {logId});
fallbackTimer = new FallbackModeTask();
fallbackTimer.schedule();
}
if (usingFallbackBackends) { if (usingFallbackBackends) {
// Populate the new fallback backends to round-robin list. // Populate the new fallback backends to round-robin list.
useFallbackBackends(); useFallbackBackends();
@ -179,22 +184,16 @@ final class GrpclbState {
maybeUpdatePicker(); maybeUpdatePicker();
} }
/** private void maybeUseFallbackBackends() {
* Start the fallback timer if it's not already started and all connections are lost.
*/
private void maybeStartFallbackTimer() {
if (fallbackTimer != null) {
return;
}
if (fallbackBackendList.isEmpty()) {
return;
}
if (balancerWorking) { if (balancerWorking) {
return; return;
} }
if (usingFallbackBackends) { if (usingFallbackBackends) {
return; return;
} }
if (fallbackTimer != null && !fallbackTimer.discarded) {
return;
}
int numReadySubchannels = 0; int numReadySubchannels = 0;
for (Subchannel subchannel : subchannels.values()) { for (Subchannel subchannel : subchannels.values()) {
if (subchannel.getAttributes().get(STATE_INFO).get().getState() == READY) { if (subchannel.getAttributes().get(STATE_INFO).get().getState() == READY) {
@ -204,9 +203,8 @@ final class GrpclbState {
if (numReadySubchannels > 0) { if (numReadySubchannels > 0) {
return; return;
} }
logger.log(Level.FINE, "[{0}] Starting fallback timer.", new Object[] {logId}); // Fallback contiditions met
fallbackTimer = new FallbackModeTask(); useFallbackBackends();
fallbackTimer.schedule();
} }
/** /**
@ -275,7 +273,6 @@ final class GrpclbState {
private void cancelFallbackTimer() { private void cancelFallbackTimer() {
if (fallbackTimer != null) { if (fallbackTimer != null) {
fallbackTimer.cancel(); fallbackTimer.cancel();
fallbackTimer = null;
} }
} }
@ -358,28 +355,24 @@ final class GrpclbState {
@VisibleForTesting @VisibleForTesting
class FallbackModeTask implements Runnable { class FallbackModeTask implements Runnable {
private ScheduledFuture<?> scheduledFuture; private ScheduledFuture<?> scheduledFuture;
// If the scheduledFuture is cancelled after the task has made it into the ChannelExecutor, the private boolean discarded;
// task will be started anyway. Use this boolean to signal that the task should not be run.
private boolean cancelled;
@Override @Override
public void run() { public void run() {
helper.runSerialized(new Runnable() { helper.runSerialized(new Runnable() {
@Override @Override
public void run() { public void run() {
if (!cancelled) { checkState(fallbackTimer == FallbackModeTask.this, "fallback timer mismatch");
checkState(fallbackTimer == FallbackModeTask.this, "fallback timer mismatch"); discarded = true;
fallbackTimer = null; maybeUseFallbackBackends();
useFallbackBackends(); maybeUpdatePicker();
maybeUpdatePicker();
}
} }
}); });
} }
void cancel() { void cancel() {
discarded = true;
scheduledFuture.cancel(false); scheduledFuture.cancel(false);
cancelled = true;
} }
void schedule() { void schedule() {
@ -556,7 +549,8 @@ final class GrpclbState {
cleanUp(); cleanUp();
propagateError(error); propagateError(error);
balancerWorking = false; balancerWorking = false;
maybeStartFallbackTimer(); maybeUseFallbackBackends();
maybeUpdatePicker();
startLbRpc(); startLbRpc();
} }

View File

@ -373,8 +373,8 @@ public class GrpclbLoadBalancerTest {
.set(GrpclbConstants.ATTR_LB_POLICY, LbPolicy.GRPCLB).build(); .set(GrpclbConstants.ATTR_LB_POLICY, LbPolicy.GRPCLB).build();
deliverResolvedAddresses(grpclbResolutionList, grpclbResolutionAttrs); deliverResolvedAddresses(grpclbResolutionList, grpclbResolutionAttrs);
// No backend address from resolver. Fallback timer is not started. // Fallback timer is started as soon as address is resolved.
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER)); assertEquals(1, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
assertEquals(1, fakeOobChannels.size()); assertEquals(1, fakeOobChannels.size());
verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture()); verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
@ -1022,6 +1022,9 @@ public class GrpclbLoadBalancerTest {
.set(GrpclbConstants.ATTR_LB_POLICY, LbPolicy.GRPCLB).build(); .set(GrpclbConstants.ATTR_LB_POLICY, LbPolicy.GRPCLB).build();
deliverResolvedAddresses(grpclbResolutionList, grpclbResolutionAttrs); deliverResolvedAddresses(grpclbResolutionList, grpclbResolutionAttrs);
// Fallback timer is started as soon as the addresses are resolved.
assertEquals(1, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
assertSame(LbPolicy.GRPCLB, balancer.getLbPolicy()); assertSame(LbPolicy.GRPCLB, balancer.getLbPolicy());
assertNull(balancer.getDelegate()); assertNull(balancer.getDelegate());
verify(helper).createOobChannel(addrsEq(grpclbResolutionList.get(0)), eq(lbAuthority(0))); verify(helper).createOobChannel(addrsEq(grpclbResolutionList.get(0)), eq(lbAuthority(0)));
@ -1036,9 +1039,6 @@ public class GrpclbLoadBalancerTest {
InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build()) InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
.build())); .build()));
// No backend address from resolver. Fallback timer is not started.
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
// Simulate receiving LB response // Simulate receiving LB response
List<ServerEntry> backends1 = Arrays.asList( List<ServerEntry> backends1 = Arrays.asList(
new ServerEntry("127.0.0.1", 2000, "token0001"), new ServerEntry("127.0.0.1", 2000, "token0001"),
@ -1209,8 +1209,7 @@ public class GrpclbLoadBalancerTest {
subtestGrpclbFallbackInitialTimeout(true); subtestGrpclbFallbackInitialTimeout(true);
} }
// Fallback within the period of the initial timeout, where the server list is not received from // Fallback or not within the period of the initial timeout.
// the balancer.
private void subtestGrpclbFallbackInitialTimeout(boolean timerExpires) { private void subtestGrpclbFallbackInitialTimeout(boolean timerExpires) {
long loadReportIntervalMillis = 1983; long loadReportIntervalMillis = 1983;
InOrder helperInOrder = inOrder(helper); InOrder helperInOrder = inOrder(helper);
@ -1247,6 +1246,29 @@ public class GrpclbLoadBalancerTest {
fakeClock.forwardTime(GrpclbState.FALLBACK_TIMEOUT_MS - 1, TimeUnit.MILLISECONDS); fakeClock.forwardTime(GrpclbState.FALLBACK_TIMEOUT_MS - 1, TimeUnit.MILLISECONDS);
assertEquals(1, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER)); assertEquals(1, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
/////////////////////////////////////////////
// Break the LB stream before timer expires
/////////////////////////////////////////////
Status streamError = Status.UNAVAILABLE.withDescription("OOB stream broken");
lbResponseObserver.onError(streamError.asException());
// Not in fallback mode. The error will be propagated.
verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
RoundRobinPicker picker = (RoundRobinPicker) pickerCaptor.getValue();
assertThat(picker.dropList).isEmpty();
ErrorEntry errorEntry = (ErrorEntry) Iterables.getOnlyElement(picker.pickList);
Status status = errorEntry.result.getStatus();
assertThat(status.getCode()).isEqualTo(streamError.getCode());
assertThat(status.getDescription()).contains(streamError.getDescription());
// A new stream is created
verify(mockLbService, times(2)).balanceLoad(lbResponseObserverCaptor.capture());
lbResponseObserver = lbResponseObserverCaptor.getValue();
assertEquals(1, lbRequestObservers.size());
lbRequestObserver = lbRequestObservers.poll();
verify(lbRequestObserver).onNext(
eq(LoadBalanceRequest.newBuilder().setInitialRequest(
InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
.build()));
////////////////////////////////// //////////////////////////////////
// Fallback timer expires (or not) // Fallback timer expires (or not)
////////////////////////////////// //////////////////////////////////
@ -1301,37 +1323,26 @@ public class GrpclbLoadBalancerTest {
helperInOrder, helper, Arrays.asList(resolutionList.get(1), resolutionList.get(2))); helperInOrder, helper, Arrays.asList(resolutionList.get(1), resolutionList.get(2)));
} }
/////////////////////// ////////////////////////////////////////////////
// Break the LB stream // Break the LB stream after the timer expires
/////////////////////// ////////////////////////////////////////////////
Status streamError = Status.UNAVAILABLE.withDescription("OOB stream broken");
lbResponseObserver.onError(streamError.asException());
if (timerExpires) { if (timerExpires) {
lbResponseObserver.onError(streamError.asException());
// The error will NOT propagate to picker because fallback list is in use. // The error will NOT propagate to picker because fallback list is in use.
helperInOrder.verify(helper, never()) helperInOrder.verify(helper, never())
.updateBalancingState(any(ConnectivityState.class), any(SubchannelPicker.class)); .updateBalancingState(any(ConnectivityState.class), any(SubchannelPicker.class));
} else { // A new stream is created
// Not in fallback mode. The error will be propagated. verify(mockLbService, times(3)).balanceLoad(lbResponseObserverCaptor.capture());
verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); lbResponseObserver = lbResponseObserverCaptor.getValue();
RoundRobinPicker picker = (RoundRobinPicker) pickerCaptor.getValue(); assertEquals(1, lbRequestObservers.size());
assertThat(picker.dropList).isEmpty(); lbRequestObserver = lbRequestObservers.poll();
ErrorEntry errorEntry = (ErrorEntry) Iterables.getOnlyElement(picker.pickList); verify(lbRequestObserver).onNext(
Status status = errorEntry.result.getStatus(); eq(LoadBalanceRequest.newBuilder().setInitialRequest(
assertThat(status.getCode()).isEqualTo(streamError.getCode()); InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
assertThat(status.getDescription()).contains(streamError.getDescription()); .build()));
} }
// A new stream is created
verify(mockLbService, times(2)).balanceLoad(lbResponseObserverCaptor.capture());
lbResponseObserver = lbResponseObserverCaptor.getValue();
assertEquals(1, lbRequestObservers.size());
lbRequestObserver = lbRequestObservers.poll();
verify(lbRequestObserver).onNext(
eq(LoadBalanceRequest.newBuilder().setInitialRequest(
InitialLoadBalanceRequest.newBuilder().setName(SERVICE_AUTHORITY).build())
.build()));
///////////////////////////////// /////////////////////////////////
// Balancer returns a server list // Balancer returns a server list
///////////////////////////////// /////////////////////////////////
@ -1359,28 +1370,23 @@ public class GrpclbLoadBalancerTest {
} }
@Test @Test
public void grpclbFallback_balancerLost_timerExpires() { public void grpclbFallback_balancerLost() {
subtestGrpclbFallbackConnectionLost(true, false, true); subtestGrpclbFallbackConnectionLost(true, false);
} }
@Test @Test
public void grpclbFallback_subchannelsLost_timerExpires() { public void grpclbFallback_subchannelsLost() {
subtestGrpclbFallbackConnectionLost(false, true, true); subtestGrpclbFallbackConnectionLost(false, true);
} }
@Test @Test
public void grpclbFallback_allLost_timerExpires() { public void grpclbFallback_allLost() {
subtestGrpclbFallbackConnectionLost(true, true, true); subtestGrpclbFallbackConnectionLost(true, true);
}
@Test
public void grpclbFallback_allLost_ResumeBeforeTimerExpires() {
subtestGrpclbFallbackConnectionLost(true, true, false);
} }
// Fallback outside of the initial timeout, where all connections are lost. // Fallback outside of the initial timeout, where all connections are lost.
private void subtestGrpclbFallbackConnectionLost( private void subtestGrpclbFallbackConnectionLost(
boolean balancerBroken, boolean allSubchannelsBroken, boolean timerExpires) { boolean balancerBroken, boolean allSubchannelsBroken) {
long loadReportIntervalMillis = 1983; long loadReportIntervalMillis = 1983;
InOrder inOrder = inOrder(helper, mockLbService); InOrder inOrder = inOrder(helper, mockLbService);
@ -1419,9 +1425,6 @@ public class GrpclbLoadBalancerTest {
lbResponseObserver.onNext(buildInitialResponse()); lbResponseObserver.onNext(buildInitialResponse());
lbResponseObserver.onNext(buildLbResponse(serverList)); lbResponseObserver.onNext(buildLbResponse(serverList));
// No fallback timer scheduled
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
List<Subchannel> subchannels = List<Subchannel> subchannels =
fallbackTestVerifyUseOfBalancerBackendLists(inOrder, helper, serverList); fallbackTestVerifyUseOfBalancerBackendLists(inOrder, helper, serverList);
@ -1442,24 +1445,14 @@ public class GrpclbLoadBalancerTest {
} }
if (balancerBroken && allSubchannelsBroken) { if (balancerBroken && allSubchannelsBroken) {
// Fallback timer is scheduled if all connections are lost. // Going into fallback
assertEquals(1, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER)); subchannels = fallbackTestVerifyUseOfFallbackBackendLists(
if (timerExpires) { inOrder, helper, Arrays.asList(resolutionList.get(0), resolutionList.get(2)));
fakeClock.forwardTime(GrpclbState.FALLBACK_TIMEOUT_MS, TimeUnit.MILLISECONDS);
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
// Going into fallback // When in fallback mode, fallback timer should not be scheduled when all backend
subchannels = fallbackTestVerifyUseOfFallbackBackendLists( // connections are lost
inOrder, helper, Arrays.asList(resolutionList.get(0), resolutionList.get(2))); for (Subchannel subchannel : subchannels) {
deliverSubchannelState(subchannel, ConnectivityStateInfo.forNonError(IDLE));
// When in fallback mode, fallback timer should not be scheduled when all backend
// connections are lost
for (Subchannel subchannel : subchannels) {
deliverSubchannelState(subchannel, ConnectivityStateInfo.forNonError(IDLE));
}
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
} else {
fakeClock.forwardTime(GrpclbState.FALLBACK_TIMEOUT_MS - 1, TimeUnit.MILLISECONDS);
} }
// Exit fallback mode or cancel fallback timer when receiving a new server list from balancer // Exit fallback mode or cancel fallback timer when receiving a new server list from balancer
@ -1469,15 +1462,11 @@ public class GrpclbLoadBalancerTest {
lbResponseObserver.onNext(buildInitialResponse()); lbResponseObserver.onNext(buildInitialResponse());
lbResponseObserver.onNext(buildLbResponse(serverList2)); lbResponseObserver.onNext(buildLbResponse(serverList2));
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER)); fallbackTestVerifyUseOfBalancerBackendLists(inOrder, helper, serverList2);
if (timerExpires) {
fallbackTestVerifyUseOfBalancerBackendLists(inOrder, helper, serverList2);
}
} else {
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
} }
assertEquals(0, fakeClock.numPendingTasks(FALLBACK_MODE_TASK_FILTER));
if (!(balancerBroken && allSubchannelsBroken && timerExpires)) { if (!(balancerBroken && allSubchannelsBroken)) {
verify(helper, never()).createSubchannel(eq(resolutionList.get(0)), any(Attributes.class)); verify(helper, never()).createSubchannel(eq(resolutionList.get(0)), any(Attributes.class));
verify(helper, never()).createSubchannel(eq(resolutionList.get(2)), any(Attributes.class)); verify(helper, never()).createSubchannel(eq(resolutionList.get(2)), any(Attributes.class));
} }