core: fix a bug for hedging with throttling (#7337)

Resolves #7222: If a hedging substream fails triggering throttling threshold, the call should be committed.

Refactored RetryPlan to two separate classes RetryPlan and HedgingPlan.
This commit is contained in:
ZHANG Dapeng 2020-08-18 23:12:25 -07:00 committed by GitHub
parent cb07b0fb45
commit a91acec2d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 120 additions and 73 deletions

View File

@ -783,7 +783,6 @@ abstract class RetriableStream<ReqT> implements ClientStream {
} }
if (state.winningSubstream == null) { if (state.winningSubstream == null) {
boolean isFatal = false;
if (rpcProgress == RpcProgress.REFUSED if (rpcProgress == RpcProgress.REFUSED
&& noMoreTransparentRetry.compareAndSet(false, true)) { && noMoreTransparentRetry.compareAndSet(false, true)) {
// transparent retry // transparent retry
@ -837,6 +836,25 @@ abstract class RetriableStream<ReqT> implements ClientStream {
nextBackoffIntervalNanos = retryPolicy.initialBackoffNanos; nextBackoffIntervalNanos = retryPolicy.initialBackoffNanos;
} }
if (isHedging) {
HedgingPlan hedgingPlan = makeHedgingDecision(status, trailers);
if (hedgingPlan.isHedgeable) {
pushbackHedging(hedgingPlan.hedgingPushbackMillis);
}
synchronized (lock) {
state = state.removeActiveHedge(substream);
// The invariant is whether or not #(Potential Hedge + active hedges) > 0.
// Once hasPotentialHedging(state) is false, it will always be false, and then
// #(state.activeHedges) will be decreasing. This guarantees that even there may be
// multiple concurrent hedges, one of the hedges will end up committed.
if (hedgingPlan.isHedgeable) {
if (hasPotentialHedging(state) || !state.activeHedges.isEmpty()) {
return;
}
// else, no activeHedges, no new hedges possible, try to commit
} // else, isHedgeable is false, try to commit
}
} else {
RetryPlan retryPlan = makeRetryDecision(status, trailers); RetryPlan retryPlan = makeRetryDecision(status, trailers);
if (retryPlan.shouldRetry) { if (retryPlan.shouldRetry) {
// The check state.winningSubstream == null, checking if is not already committed, is // The check state.winningSubstream == null, checking if is not already committed, is
@ -845,16 +863,18 @@ abstract class RetriableStream<ReqT> implements ClientStream {
synchronized (lock) { synchronized (lock) {
scheduledRetry = scheduledRetryCopy = new FutureCanceller(lock); scheduledRetry = scheduledRetryCopy = new FutureCanceller(lock);
} }
scheduledRetryCopy.setFuture(scheduledExecutorService.schedule( scheduledRetryCopy.setFuture(
scheduledExecutorService.schedule(
new Runnable() { new Runnable() {
@Override @Override
public void run() { public void run() {
callExecutor.execute(new Runnable() { callExecutor.execute(
new Runnable() {
@Override @Override
public void run() { public void run() {
// retry // retry
Substream newSubstream Substream newSubstream =
= createSubstream(substream.previousAttemptCount + 1); createSubstream(substream.previousAttemptCount + 1);
drain(newSubstream); drain(newSubstream);
} }
}); });
@ -864,24 +884,6 @@ abstract class RetriableStream<ReqT> implements ClientStream {
TimeUnit.NANOSECONDS)); TimeUnit.NANOSECONDS));
return; return;
} }
isFatal = retryPlan.isFatal;
pushbackHedging(retryPlan.hedgingPushbackMillis);
}
if (isHedging) {
synchronized (lock) {
state = state.removeActiveHedge(substream);
// The invariant is whether or not #(Potential Hedge + active hedges) > 0.
// Once hasPotentialHedging(state) is false, it will always be false, and then
// #(state.activeHedges) will be decreasing. This guarantees that even there may be
// multiple concurrent hedges, one of the hedges will end up committed.
if (!isFatal) {
if (hasPotentialHedging(state) || !state.activeHedges.isEmpty()) {
return;
}
// else, no activeHedges, no new hedges possible, try to commit
} // else, fatal, try to commit
} }
} }
} }
@ -901,26 +903,10 @@ abstract class RetriableStream<ReqT> implements ClientStream {
boolean shouldRetry = false; boolean shouldRetry = false;
long backoffNanos = 0L; long backoffNanos = 0L;
boolean isRetryableStatusCode = retryPolicy.retryableStatusCodes.contains(status.getCode()); boolean isRetryableStatusCode = retryPolicy.retryableStatusCodes.contains(status.getCode());
boolean isNonFatalStatusCode = hedgingPolicy.nonFatalStatusCodes.contains(status.getCode()); Integer pushbackMillis = getPushbackMills(trailer);
if (isHedging && !isNonFatalStatusCode) {
// isFatal is true, no pushback
return new RetryPlan(/* shouldRetry = */ false, /* isFatal = */ true, 0, null);
}
String pushbackStr = trailer.get(GRPC_RETRY_PUSHBACK_MS);
Integer pushbackMillis = null;
if (pushbackStr != null) {
try {
pushbackMillis = Integer.valueOf(pushbackStr);
} catch (NumberFormatException e) {
pushbackMillis = -1;
}
}
boolean isThrottled = false; boolean isThrottled = false;
if (throttle != null) { if (throttle != null) {
if (isRetryableStatusCode || isNonFatalStatusCode if (isRetryableStatusCode || (pushbackMillis != null && pushbackMillis < 0)) {
|| (pushbackMillis != null && pushbackMillis < 0)) {
isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold(); isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold();
} }
} }
@ -933,7 +919,6 @@ abstract class RetriableStream<ReqT> implements ClientStream {
nextBackoffIntervalNanos = Math.min( nextBackoffIntervalNanos = Math.min(
(long) (nextBackoffIntervalNanos * retryPolicy.backoffMultiplier), (long) (nextBackoffIntervalNanos * retryPolicy.backoffMultiplier),
retryPolicy.maxBackoffNanos); retryPolicy.maxBackoffNanos);
} // else no retry } // else no retry
} else if (pushbackMillis >= 0) { } else if (pushbackMillis >= 0) {
shouldRetry = true; shouldRetry = true;
@ -942,8 +927,33 @@ abstract class RetriableStream<ReqT> implements ClientStream {
} // else no retry } // else no retry
} // else no retry } // else no retry
return new RetryPlan( return new RetryPlan(shouldRetry, backoffNanos);
shouldRetry, /* isFatal = */ false, backoffNanos, isHedging ? pushbackMillis : null); }
private HedgingPlan makeHedgingDecision(Status status, Metadata trailer) {
Integer pushbackMillis = getPushbackMills(trailer);
boolean isFatal = !hedgingPolicy.nonFatalStatusCodes.contains(status.getCode());
boolean isThrottled = false;
if (throttle != null) {
if (!isFatal || (pushbackMillis != null && pushbackMillis < 0)) {
isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold();
}
}
return new HedgingPlan(!isFatal && !isThrottled, pushbackMillis);
}
@Nullable
private Integer getPushbackMills(Metadata trailer) {
String pushbackStr = trailer.get(GRPC_RETRY_PUSHBACK_MS);
Integer pushbackMillis = null;
if (pushbackStr != null) {
try {
pushbackMillis = Integer.valueOf(pushbackStr);
} catch (NumberFormatException e) {
pushbackMillis = -1;
}
}
return pushbackMillis;
} }
@Override @Override
@ -1361,17 +1371,22 @@ abstract class RetriableStream<ReqT> implements ClientStream {
private static final class RetryPlan { private static final class RetryPlan {
final boolean shouldRetry; final boolean shouldRetry;
final boolean isFatal; // receiving a status not among the nonFatalStatusCodes
final long backoffNanos; final long backoffNanos;
RetryPlan(boolean shouldRetry, long backoffNanos) {
this.shouldRetry = shouldRetry;
this.backoffNanos = backoffNanos;
}
}
private static final class HedgingPlan {
final boolean isHedgeable;
@Nullable @Nullable
final Integer hedgingPushbackMillis; final Integer hedgingPushbackMillis;
RetryPlan( public HedgingPlan(
boolean shouldRetry, boolean isFatal, long backoffNanos, boolean isHedgeable, @Nullable Integer hedgingPushbackMillis) {
@Nullable Integer hedgingPushbackMillis) { this.isHedgeable = isHedgeable;
this.shouldRetry = shouldRetry;
this.isFatal = isFatal;
this.backoffNanos = backoffNanos;
this.hedgingPushbackMillis = hedgingPushbackMillis; this.hedgingPushbackMillis = hedgingPushbackMillis;
} }
} }

View File

@ -37,6 +37,7 @@ import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never; import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times; import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoInteractions;
import static org.mockito.Mockito.verifyNoMoreInteractions; import static org.mockito.Mockito.verifyNoMoreInteractions;
import static org.mockito.Mockito.when; import static org.mockito.Mockito.when;
@ -2284,7 +2285,7 @@ public class RetriableStreamTest {
} }
@Test @Test
public void hedging_throttled() { public void hedging_throttledByOtherCall() {
Throttle throttle = new Throttle(4f, 0.8f); Throttle throttle = new Throttle(4f, 0.8f);
RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle); RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle);
@ -2313,6 +2314,37 @@ public class RetriableStreamTest {
assertEquals(0, fakeClock.numPendingTasks()); assertEquals(0, fakeClock.numPendingTasks());
} }
@Test
public void hedging_throttledByHedgingStreams() {
Throttle throttle = new Throttle(4f, 0.8f);
RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle);
ClientStream mockStream1 = mock(ClientStream.class);
ClientStream mockStream2 = mock(ClientStream.class);
ClientStream mockStream3 = mock(ClientStream.class);
when(retriableStreamRecorder.newSubstream(anyInt()))
.thenReturn(mockStream1, mockStream2, mockStream3);
hedgingStream.start(masterListener);
ArgumentCaptor<ClientStreamListener> sublistenerCaptor1 =
ArgumentCaptor.forClass(ClientStreamListener.class);
verify(mockStream1).start(sublistenerCaptor1.capture());
fakeClock.forwardTime(HEDGING_DELAY_IN_SECONDS, TimeUnit.SECONDS);
ArgumentCaptor<ClientStreamListener> sublistenerCaptor2 =
ArgumentCaptor.forClass(ClientStreamListener.class);
verify(mockStream2).start(sublistenerCaptor2.capture());
sublistenerCaptor1.getValue().closed(Status.fromCode(NON_FATAL_STATUS_CODE_1), new Metadata());
assertTrue(throttle.isAboveThreshold()); // count = 3
sublistenerCaptor2.getValue().closed(Status.fromCode(NON_FATAL_STATUS_CODE_1), new Metadata());
assertFalse(throttle.isAboveThreshold()); // count = 2
verify(masterListener).closed(any(Status.class), any(Metadata.class));
verifyNoInteractions(mockStream3);
assertEquals(0, fakeClock.numPendingTasks());
}
/** /**
* Used to stub a retriable stream as well as to record methods of the retriable stream being * Used to stub a retriable stream as well as to record methods of the retriable stream being
* called. * called.