mirror of https://github.com/grpc/grpc-java.git
core: fix a bug for hedging with throttling (#7337)
Resolves #7222: If a hedging substream fails triggering throttling threshold, the call should be committed. Refactored RetryPlan to two separate classes RetryPlan and HedgingPlan.
This commit is contained in:
parent
cb07b0fb45
commit
a91acec2d4
|
|
@ -783,7 +783,6 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (state.winningSubstream == null) {
|
if (state.winningSubstream == null) {
|
||||||
boolean isFatal = false;
|
|
||||||
if (rpcProgress == RpcProgress.REFUSED
|
if (rpcProgress == RpcProgress.REFUSED
|
||||||
&& noMoreTransparentRetry.compareAndSet(false, true)) {
|
&& noMoreTransparentRetry.compareAndSet(false, true)) {
|
||||||
// transparent retry
|
// transparent retry
|
||||||
|
|
@ -837,51 +836,54 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
nextBackoffIntervalNanos = retryPolicy.initialBackoffNanos;
|
nextBackoffIntervalNanos = retryPolicy.initialBackoffNanos;
|
||||||
}
|
}
|
||||||
|
|
||||||
RetryPlan retryPlan = makeRetryDecision(status, trailers);
|
if (isHedging) {
|
||||||
if (retryPlan.shouldRetry) {
|
HedgingPlan hedgingPlan = makeHedgingDecision(status, trailers);
|
||||||
// The check state.winningSubstream == null, checking if is not already committed, is
|
if (hedgingPlan.isHedgeable) {
|
||||||
// racy, but is still safe b/c the retry will also handle committed/cancellation
|
pushbackHedging(hedgingPlan.hedgingPushbackMillis);
|
||||||
FutureCanceller scheduledRetryCopy;
|
|
||||||
synchronized (lock) {
|
|
||||||
scheduledRetry = scheduledRetryCopy = new FutureCanceller(lock);
|
|
||||||
}
|
}
|
||||||
scheduledRetryCopy.setFuture(scheduledExecutorService.schedule(
|
synchronized (lock) {
|
||||||
new Runnable() {
|
state = state.removeActiveHedge(substream);
|
||||||
@Override
|
// The invariant is whether or not #(Potential Hedge + active hedges) > 0.
|
||||||
public void run() {
|
// Once hasPotentialHedging(state) is false, it will always be false, and then
|
||||||
callExecutor.execute(new Runnable() {
|
// #(state.activeHedges) will be decreasing. This guarantees that even there may be
|
||||||
@Override
|
// multiple concurrent hedges, one of the hedges will end up committed.
|
||||||
public void run() {
|
if (hedgingPlan.isHedgeable) {
|
||||||
// retry
|
if (hasPotentialHedging(state) || !state.activeHedges.isEmpty()) {
|
||||||
Substream newSubstream
|
return;
|
||||||
= createSubstream(substream.previousAttemptCount + 1);
|
}
|
||||||
drain(newSubstream);
|
// else, no activeHedges, no new hedges possible, try to commit
|
||||||
}
|
} // else, isHedgeable is false, try to commit
|
||||||
});
|
}
|
||||||
}
|
} else {
|
||||||
},
|
RetryPlan retryPlan = makeRetryDecision(status, trailers);
|
||||||
retryPlan.backoffNanos,
|
if (retryPlan.shouldRetry) {
|
||||||
TimeUnit.NANOSECONDS));
|
// The check state.winningSubstream == null, checking if is not already committed, is
|
||||||
return;
|
// racy, but is still safe b/c the retry will also handle committed/cancellation
|
||||||
}
|
FutureCanceller scheduledRetryCopy;
|
||||||
isFatal = retryPlan.isFatal;
|
synchronized (lock) {
|
||||||
pushbackHedging(retryPlan.hedgingPushbackMillis);
|
scheduledRetry = scheduledRetryCopy = new FutureCanceller(lock);
|
||||||
}
|
|
||||||
|
|
||||||
if (isHedging) {
|
|
||||||
synchronized (lock) {
|
|
||||||
state = state.removeActiveHedge(substream);
|
|
||||||
|
|
||||||
// The invariant is whether or not #(Potential Hedge + active hedges) > 0.
|
|
||||||
// Once hasPotentialHedging(state) is false, it will always be false, and then
|
|
||||||
// #(state.activeHedges) will be decreasing. This guarantees that even there may be
|
|
||||||
// multiple concurrent hedges, one of the hedges will end up committed.
|
|
||||||
if (!isFatal) {
|
|
||||||
if (hasPotentialHedging(state) || !state.activeHedges.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
// else, no activeHedges, no new hedges possible, try to commit
|
scheduledRetryCopy.setFuture(
|
||||||
} // else, fatal, try to commit
|
scheduledExecutorService.schedule(
|
||||||
|
new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
callExecutor.execute(
|
||||||
|
new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
// retry
|
||||||
|
Substream newSubstream =
|
||||||
|
createSubstream(substream.previousAttemptCount + 1);
|
||||||
|
drain(newSubstream);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
},
|
||||||
|
retryPlan.backoffNanos,
|
||||||
|
TimeUnit.NANOSECONDS));
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -901,26 +903,10 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
boolean shouldRetry = false;
|
boolean shouldRetry = false;
|
||||||
long backoffNanos = 0L;
|
long backoffNanos = 0L;
|
||||||
boolean isRetryableStatusCode = retryPolicy.retryableStatusCodes.contains(status.getCode());
|
boolean isRetryableStatusCode = retryPolicy.retryableStatusCodes.contains(status.getCode());
|
||||||
boolean isNonFatalStatusCode = hedgingPolicy.nonFatalStatusCodes.contains(status.getCode());
|
Integer pushbackMillis = getPushbackMills(trailer);
|
||||||
if (isHedging && !isNonFatalStatusCode) {
|
|
||||||
// isFatal is true, no pushback
|
|
||||||
return new RetryPlan(/* shouldRetry = */ false, /* isFatal = */ true, 0, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
String pushbackStr = trailer.get(GRPC_RETRY_PUSHBACK_MS);
|
|
||||||
Integer pushbackMillis = null;
|
|
||||||
if (pushbackStr != null) {
|
|
||||||
try {
|
|
||||||
pushbackMillis = Integer.valueOf(pushbackStr);
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
pushbackMillis = -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isThrottled = false;
|
boolean isThrottled = false;
|
||||||
if (throttle != null) {
|
if (throttle != null) {
|
||||||
if (isRetryableStatusCode || isNonFatalStatusCode
|
if (isRetryableStatusCode || (pushbackMillis != null && pushbackMillis < 0)) {
|
||||||
|| (pushbackMillis != null && pushbackMillis < 0)) {
|
|
||||||
isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold();
|
isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -933,7 +919,6 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
nextBackoffIntervalNanos = Math.min(
|
nextBackoffIntervalNanos = Math.min(
|
||||||
(long) (nextBackoffIntervalNanos * retryPolicy.backoffMultiplier),
|
(long) (nextBackoffIntervalNanos * retryPolicy.backoffMultiplier),
|
||||||
retryPolicy.maxBackoffNanos);
|
retryPolicy.maxBackoffNanos);
|
||||||
|
|
||||||
} // else no retry
|
} // else no retry
|
||||||
} else if (pushbackMillis >= 0) {
|
} else if (pushbackMillis >= 0) {
|
||||||
shouldRetry = true;
|
shouldRetry = true;
|
||||||
|
|
@ -942,8 +927,33 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
} // else no retry
|
} // else no retry
|
||||||
} // else no retry
|
} // else no retry
|
||||||
|
|
||||||
return new RetryPlan(
|
return new RetryPlan(shouldRetry, backoffNanos);
|
||||||
shouldRetry, /* isFatal = */ false, backoffNanos, isHedging ? pushbackMillis : null);
|
}
|
||||||
|
|
||||||
|
private HedgingPlan makeHedgingDecision(Status status, Metadata trailer) {
|
||||||
|
Integer pushbackMillis = getPushbackMills(trailer);
|
||||||
|
boolean isFatal = !hedgingPolicy.nonFatalStatusCodes.contains(status.getCode());
|
||||||
|
boolean isThrottled = false;
|
||||||
|
if (throttle != null) {
|
||||||
|
if (!isFatal || (pushbackMillis != null && pushbackMillis < 0)) {
|
||||||
|
isThrottled = !throttle.onQualifiedFailureThenCheckIsAboveThreshold();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new HedgingPlan(!isFatal && !isThrottled, pushbackMillis);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private Integer getPushbackMills(Metadata trailer) {
|
||||||
|
String pushbackStr = trailer.get(GRPC_RETRY_PUSHBACK_MS);
|
||||||
|
Integer pushbackMillis = null;
|
||||||
|
if (pushbackStr != null) {
|
||||||
|
try {
|
||||||
|
pushbackMillis = Integer.valueOf(pushbackStr);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
pushbackMillis = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pushbackMillis;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -1361,17 +1371,22 @@ abstract class RetriableStream<ReqT> implements ClientStream {
|
||||||
|
|
||||||
private static final class RetryPlan {
|
private static final class RetryPlan {
|
||||||
final boolean shouldRetry;
|
final boolean shouldRetry;
|
||||||
final boolean isFatal; // receiving a status not among the nonFatalStatusCodes
|
|
||||||
final long backoffNanos;
|
final long backoffNanos;
|
||||||
|
|
||||||
|
RetryPlan(boolean shouldRetry, long backoffNanos) {
|
||||||
|
this.shouldRetry = shouldRetry;
|
||||||
|
this.backoffNanos = backoffNanos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final class HedgingPlan {
|
||||||
|
final boolean isHedgeable;
|
||||||
@Nullable
|
@Nullable
|
||||||
final Integer hedgingPushbackMillis;
|
final Integer hedgingPushbackMillis;
|
||||||
|
|
||||||
RetryPlan(
|
public HedgingPlan(
|
||||||
boolean shouldRetry, boolean isFatal, long backoffNanos,
|
boolean isHedgeable, @Nullable Integer hedgingPushbackMillis) {
|
||||||
@Nullable Integer hedgingPushbackMillis) {
|
this.isHedgeable = isHedgeable;
|
||||||
this.shouldRetry = shouldRetry;
|
|
||||||
this.isFatal = isFatal;
|
|
||||||
this.backoffNanos = backoffNanos;
|
|
||||||
this.hedgingPushbackMillis = hedgingPushbackMillis;
|
this.hedgingPushbackMillis = hedgingPushbackMillis;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,7 @@ import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.never;
|
import static org.mockito.Mockito.never;
|
||||||
import static org.mockito.Mockito.times;
|
import static org.mockito.Mockito.times;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.verifyNoInteractions;
|
||||||
import static org.mockito.Mockito.verifyNoMoreInteractions;
|
import static org.mockito.Mockito.verifyNoMoreInteractions;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
|
@ -2284,7 +2285,7 @@ public class RetriableStreamTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void hedging_throttled() {
|
public void hedging_throttledByOtherCall() {
|
||||||
Throttle throttle = new Throttle(4f, 0.8f);
|
Throttle throttle = new Throttle(4f, 0.8f);
|
||||||
RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle);
|
RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle);
|
||||||
|
|
||||||
|
|
@ -2313,6 +2314,37 @@ public class RetriableStreamTest {
|
||||||
assertEquals(0, fakeClock.numPendingTasks());
|
assertEquals(0, fakeClock.numPendingTasks());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void hedging_throttledByHedgingStreams() {
|
||||||
|
Throttle throttle = new Throttle(4f, 0.8f);
|
||||||
|
RetriableStream<String> hedgingStream = newThrottledHedgingStream(throttle);
|
||||||
|
|
||||||
|
ClientStream mockStream1 = mock(ClientStream.class);
|
||||||
|
ClientStream mockStream2 = mock(ClientStream.class);
|
||||||
|
ClientStream mockStream3 = mock(ClientStream.class);
|
||||||
|
when(retriableStreamRecorder.newSubstream(anyInt()))
|
||||||
|
.thenReturn(mockStream1, mockStream2, mockStream3);
|
||||||
|
|
||||||
|
hedgingStream.start(masterListener);
|
||||||
|
ArgumentCaptor<ClientStreamListener> sublistenerCaptor1 =
|
||||||
|
ArgumentCaptor.forClass(ClientStreamListener.class);
|
||||||
|
verify(mockStream1).start(sublistenerCaptor1.capture());
|
||||||
|
|
||||||
|
fakeClock.forwardTime(HEDGING_DELAY_IN_SECONDS, TimeUnit.SECONDS);
|
||||||
|
ArgumentCaptor<ClientStreamListener> sublistenerCaptor2 =
|
||||||
|
ArgumentCaptor.forClass(ClientStreamListener.class);
|
||||||
|
verify(mockStream2).start(sublistenerCaptor2.capture());
|
||||||
|
|
||||||
|
sublistenerCaptor1.getValue().closed(Status.fromCode(NON_FATAL_STATUS_CODE_1), new Metadata());
|
||||||
|
assertTrue(throttle.isAboveThreshold()); // count = 3
|
||||||
|
sublistenerCaptor2.getValue().closed(Status.fromCode(NON_FATAL_STATUS_CODE_1), new Metadata());
|
||||||
|
assertFalse(throttle.isAboveThreshold()); // count = 2
|
||||||
|
|
||||||
|
verify(masterListener).closed(any(Status.class), any(Metadata.class));
|
||||||
|
verifyNoInteractions(mockStream3);
|
||||||
|
assertEquals(0, fakeClock.numPendingTasks());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used to stub a retriable stream as well as to record methods of the retriable stream being
|
* Used to stub a retriable stream as well as to record methods of the retriable stream being
|
||||||
* called.
|
* called.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue