client: fix potential panic during RPC retries (#5323)

Doug Fawley 2022-05-04 10:06:12 -07:00 committed by GitHub
parent 78b13f27de
commit 799605c228
6 changed files with 229 additions and 152 deletions
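The panic this change prevents comes from operating on a clientStream whose current attempt never obtained a transport stream: previously cs.attempt was published before the attempt had a stream, so after a canceled retry cs.attempt.s could be nil and methods such as Context() or CloseSend() dereferenced it. The sketch below uses simplified stand-in types, not grpc-go's real ones, and only illustrates the guard that Context() gains in the stream.go hunks further down; TestRetryTransparentWhenCommitted at the end of the diff exercises the same scenario through the real client.

package main

import (
	"context"
	"fmt"
)

// Simplified stand-ins for the real clientStream/csAttempt types; only what is
// needed to show the nil-stream guard added to Context() in stream.go below.
type transportStream struct{ ctx context.Context }

func (s *transportStream) Context() context.Context { return s.ctx }

type csAttempt struct{ s *transportStream }

type clientStream struct {
	ctx     context.Context
	attempt *csAttempt
}

// Context mirrors the guarded version from the diff: if the committed attempt
// never obtained a transport stream (s == nil), fall back to the RPC's own
// context instead of panicking on a nil dereference.
func (cs *clientStream) Context() context.Context {
	if cs.attempt.s != nil {
		return cs.attempt.s.Context()
	}
	return cs.ctx
}

func main() {
	// An attempt whose stream was never created, e.g. the RPC was canceled
	// while a retry was still waiting to create its stream.
	cs := &clientStream{ctx: context.Background(), attempt: &csAttempt{}}
	fmt.Println(cs.Context() != nil) // true; the unguarded version would panic here
}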


@ -907,14 +907,10 @@ func (cc *ClientConn) healthCheckConfig() *healthCheckConfig {
}
func (cc *ClientConn) getTransport(ctx context.Context, failfast bool, method string) (transport.ClientTransport, func(balancer.DoneInfo), error) {
t, done, err := cc.blockingpicker.pick(ctx, failfast, balancer.PickInfo{
return cc.blockingpicker.pick(ctx, failfast, balancer.PickInfo{
Ctx: ctx,
FullMethodName: method,
})
if err != nil {
return nil, nil, toRPCErr(err)
}
return t, done, nil
}
func (cc *ClientConn) applyServiceConfigAndBalancer(sc *ServiceConfig, configSelector iresolver.ConfigSelector, addrs []resolver.Address) {


@ -631,8 +631,8 @@ func (t *http2Client) getCallAuthData(ctx context.Context, audience string, call
// the wire. However, there are two notable exceptions:
//
// 1. If the stream headers violate the max header list size allowed by the
// server. In this case there is no reason to retry at all, as it is
// assumed the RPC would continue to fail on subsequent attempts.
// server. It's possible this could succeed on another transport, even if
// it's unlikely, but do not transparently retry.
// 2. If the credentials errored when requesting their headers. In this case,
// it's possible a retry can fix the problem, but indefinitely transparently
// retrying is not appropriate as it is likely the credentials, if they can
@ -640,8 +640,7 @@ func (t *http2Client) getCallAuthData(ctx context.Context, audience string, call
type NewStreamError struct {
Err error
DoNotRetry bool
DoNotTransparentRetry bool
AllowTransparentRetry bool
}
func (e NewStreamError) Error() string {
@ -650,11 +649,11 @@ func (e NewStreamError) Error() string {
// NewStream creates a stream and registers it into the transport as "active"
// streams. All non-nil errors returned will be *NewStreamError.
func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (_ *Stream, err error) {
func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (*Stream, error) {
ctx = peer.NewContext(ctx, t.getPeer())
headerFields, err := t.createHeaderFields(ctx, callHdr)
if err != nil {
return nil, &NewStreamError{Err: err, DoNotTransparentRetry: true}
return nil, &NewStreamError{Err: err, AllowTransparentRetry: false}
}
s := t.newStream(ctx, callHdr)
cleanup := func(err error) {
@ -754,13 +753,14 @@ func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (_ *Strea
return true
}, hdr)
if err != nil {
return nil, &NewStreamError{Err: err}
// Connection closed.
return nil, &NewStreamError{Err: err, AllowTransparentRetry: true}
}
if success {
break
}
if hdrListSizeErr != nil {
return nil, &NewStreamError{Err: hdrListSizeErr, DoNotRetry: true}
return nil, &NewStreamError{Err: hdrListSizeErr}
}
firstTry = false
select {
@ -768,9 +768,9 @@ func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (_ *Strea
case <-ctx.Done():
return nil, &NewStreamError{Err: ContextErr(ctx.Err())}
case <-t.goAway:
return nil, &NewStreamError{Err: errStreamDrain}
return nil, &NewStreamError{Err: errStreamDrain, AllowTransparentRetry: true}
case <-t.ctx.Done():
return nil, &NewStreamError{Err: ErrConnClosing}
return nil, &NewStreamError{Err: ErrConnClosing, AllowTransparentRetry: true}
}
}
if t.statsHandler != nil {

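Because the hunks above interleave old and new lines, the net effect of the NewStreamError change is easy to miss. As read from this diff, the sketch below summarizes which http2Client.NewStream failure modes set AllowTransparentRetry; the NewStreamError type is copied locally purely for illustration, since the real one lives in the internal transport package and cannot be imported by user code.

package main

import "fmt"

// Local stand-in for transport.NewStreamError (internal package), listing only
// the fields visible in the hunks above.
type NewStreamError struct {
	Err                   error
	DoNotRetry            bool
	AllowTransparentRetry bool
}

func main() {
	// Failure modes from NewStream in this diff and the flag each one sets.
	cases := []struct {
		cause string
		nse   NewStreamError
	}{
		{"credentials error while building headers", NewStreamError{AllowTransparentRetry: false}},
		{"headers exceed server's max header list size", NewStreamError{AllowTransparentRetry: false}},
		{"RPC context done before the stream is created", NewStreamError{AllowTransparentRetry: false}},
		{"connection closed while writing headers", NewStreamError{AllowTransparentRetry: true}},
		{"GOAWAY received (stream drain)", NewStreamError{AllowTransparentRetry: true}},
		{"transport closing", NewStreamError{AllowTransparentRetry: true}},
	}
	for _, c := range cases {
		fmt.Printf("%-46s -> transparent retry allowed: %v\n", c.cause, c.nse.AllowTransparentRetry)
	}
}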

@ -131,7 +131,7 @@ func (pw *pickerWrapper) pick(ctx context.Context, failfast bool, info balancer.
}
if _, ok := status.FromError(err); ok {
// Status error: end the RPC unconditionally with this status.
return nil, nil, err
return nil, nil, dropError{error: err}
}
// For all other errors, wait for ready RPCs should block and other
// RPCs should fail with unavailable.
@ -175,3 +175,9 @@ func (pw *pickerWrapper) close() {
pw.done = true
close(pw.blockingCh)
}
// dropError is a wrapper error that indicates the LB policy wishes to drop the
// RPC and not retry it.
type dropError struct {
error
}
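For context on the new dropError above: a hypothetical picker like the one below returns a status error from Pick; pickerWrapper.pick now wraps that error in dropError, csAttempt.getTransport (in the stream.go hunks below) unwraps it and sets a.drop, and shouldRetry then refuses to retry, so the RPC fails with exactly the picker's status. This is a minimal sketch against the public balancer API, not code from this change.

package main

import (
	"fmt"

	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// dropPicker is a hypothetical picker used only to illustrate the dropError
// path: returning a status error means "drop this RPC with this status and do
// not retry it".
type dropPicker struct{}

func (dropPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) {
	return balancer.PickResult{}, status.Error(codes.Unavailable, "dropped by LB policy")
}

func main() {
	_, err := dropPicker{}.Pick(balancer.PickInfo{})
	// The status code survives unchanged all the way to the caller of the RPC.
	fmt.Println(status.Code(err)) // Unavailable
}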

stream.go

@ -303,14 +303,28 @@ func newClientStreamWithParams(ctx context.Context, desc *StreamDesc, cc *Client
}
cs.binlog = binarylog.GetMethodLogger(method)
if err := cs.newAttemptLocked(false /* isTransparent */); err != nil {
cs.attempt, err = cs.newAttemptLocked(false /* isTransparent */)
if err != nil {
cs.finish(err)
return nil, err
}
op := func(a *csAttempt) error { return a.newStream() }
// Pick the transport to use and create a new stream on the transport.
// Assign cs.attempt upon success.
op := func(a *csAttempt) error {
if err := a.getTransport(); err != nil {
return err
}
if err := a.newStream(); err != nil {
return err
}
// Because this operation is always called either here (while creating
// the clientStream) or by the retry code while locked when replaying
// the operation, it is safe to access cs.attempt directly.
cs.attempt = a
return nil
}
if err := cs.withRetry(op, func() { cs.bufferForRetryLocked(0, op) }); err != nil {
cs.finish(err)
return nil, err
}
@ -349,9 +363,15 @@ func newClientStreamWithParams(ctx context.Context, desc *StreamDesc, cc *Client
return cs, nil
}
// newAttemptLocked creates a new attempt with a transport.
// If it succeeds, then it replaces clientStream's attempt with this new attempt.
func (cs *clientStream) newAttemptLocked(isTransparent bool) (retErr error) {
// newAttemptLocked creates a new csAttempt without a transport or stream.
func (cs *clientStream) newAttemptLocked(isTransparent bool) (*csAttempt, error) {
if err := cs.ctx.Err(); err != nil {
return nil, toRPCErr(err)
}
if err := cs.cc.ctx.Err(); err != nil {
return nil, ErrClientConnClosing
}
ctx := newContextWithRPCInfo(cs.ctx, cs.callInfo.failFast, cs.callInfo.codec, cs.cp, cs.comp)
method := cs.callHdr.Method
sh := cs.cc.dopts.copts.StatsHandler
@ -385,27 +405,6 @@ func (cs *clientStream) newAttemptLocked(isTransparent bool) (retErr error) {
ctx = trace.NewContext(ctx, trInfo.tr)
}
newAttempt := &csAttempt{
ctx: ctx,
beginTime: beginTime,
cs: cs,
dc: cs.cc.dopts.dc,
statsHandler: sh,
trInfo: trInfo,
}
defer func() {
if retErr != nil {
// This attempt is not set in the clientStream, so it's finish won't
// be called. Call it here for stats and trace in case they are not
// nil.
newAttempt.finish(retErr)
}
}()
if err := ctx.Err(); err != nil {
return toRPCErr(err)
}
if cs.cc.parsedTarget.Scheme == "xds" {
// Add extra metadata (metadata that will be added by transport) to context
// so the balancer can see them.
@ -413,16 +412,32 @@ func (cs *clientStream) newAttemptLocked(isTransparent bool) (retErr error) {
"content-type", grpcutil.ContentType(cs.callHdr.ContentSubtype),
))
}
t, done, err := cs.cc.getTransport(ctx, cs.callInfo.failFast, cs.callHdr.Method)
return &csAttempt{
ctx: ctx,
beginTime: beginTime,
cs: cs,
dc: cs.cc.dopts.dc,
statsHandler: sh,
trInfo: trInfo,
}, nil
}
func (a *csAttempt) getTransport() error {
cs := a.cs
var err error
a.t, a.done, err = cs.cc.getTransport(a.ctx, cs.callInfo.failFast, cs.callHdr.Method)
if err != nil {
if de, ok := err.(dropError); ok {
err = de.error
a.drop = true
}
return err
}
if trInfo != nil {
trInfo.firstLine.SetRemoteAddr(t.RemoteAddr())
if a.trInfo != nil {
a.trInfo.firstLine.SetRemoteAddr(a.t.RemoteAddr())
}
newAttempt.t = t
newAttempt.done = done
cs.attempt = newAttempt
return nil
}
@ -431,12 +446,21 @@ func (a *csAttempt) newStream() error {
cs.callHdr.PreviousAttempts = cs.numRetries
s, err := a.t.NewStream(a.ctx, cs.callHdr)
if err != nil {
// Return without converting to an RPC error so retry code can
// inspect.
return err
nse, ok := err.(*transport.NewStreamError)
if !ok {
// Unexpected.
return err
}
if nse.AllowTransparentRetry {
a.allowTransparentRetry = true
}
// Unwrap and convert error.
return toRPCErr(nse.Err)
}
cs.attempt.s = s
cs.attempt.p = &parser{r: s}
a.s = s
a.p = &parser{r: s}
return nil
}
@ -514,6 +538,11 @@ type csAttempt struct {
statsHandler stats.Handler
beginTime time.Time
// set for newStream errors that may be transparently retried
allowTransparentRetry bool
// set for pick errors that are returned as a status
drop bool
}
func (cs *clientStream) commitAttemptLocked() {
@ -533,41 +562,21 @@ func (cs *clientStream) commitAttempt() {
// shouldRetry returns nil if the RPC should be retried; otherwise it returns
// the error that should be returned by the operation. If the RPC should be
// retried, the bool indicates whether it is being retried transparently.
func (cs *clientStream) shouldRetry(err error) (bool, error) {
if cs.attempt.s == nil {
// Error from NewClientStream.
nse, ok := err.(*transport.NewStreamError)
if !ok {
// Unexpected, but assume no I/O was performed and the RPC is not
// fatal, so retry indefinitely.
return true, nil
}
func (a *csAttempt) shouldRetry(err error) (bool, error) {
cs := a.cs
// Unwrap and convert error.
err = toRPCErr(nse.Err)
// Never retry DoNotRetry errors, which indicate the RPC should not be
// retried due to max header list size violation, etc.
if nse.DoNotRetry {
return false, err
}
// In the event of a non-IO operation error from NewStream, we never
// attempted to write anything to the wire, so we can retry
// indefinitely.
if !nse.DoNotTransparentRetry {
return true, nil
}
}
if cs.finished || cs.committed {
// RPC is finished or committed; cannot retry.
if cs.finished || cs.committed || a.drop {
// RPC is finished or committed or was dropped by the picker; cannot retry.
return false, err
}
if a.s == nil && a.allowTransparentRetry {
return true, nil
}
// Wait for the trailers.
unprocessed := false
if cs.attempt.s != nil {
<-cs.attempt.s.Done()
unprocessed = cs.attempt.s.Unprocessed()
if a.s != nil {
<-a.s.Done()
unprocessed = a.s.Unprocessed()
}
if cs.firstAttempt && unprocessed {
// First attempt, stream unprocessed: transparently retry.
@ -579,14 +588,14 @@ func (cs *clientStream) shouldRetry(err error) (bool, error) {
pushback := 0
hasPushback := false
if cs.attempt.s != nil {
if !cs.attempt.s.TrailersOnly() {
if a.s != nil {
if !a.s.TrailersOnly() {
return false, err
}
// TODO(retry): Move down if the spec changes to not check server pushback
// before considering this a failure for throttling.
sps := cs.attempt.s.Trailer()["grpc-retry-pushback-ms"]
sps := a.s.Trailer()["grpc-retry-pushback-ms"]
if len(sps) == 1 {
var e error
if pushback, e = strconv.Atoi(sps[0]); e != nil || pushback < 0 {
@ -603,10 +612,10 @@ func (cs *clientStream) shouldRetry(err error) (bool, error) {
}
var code codes.Code
if cs.attempt.s != nil {
code = cs.attempt.s.Status().Code()
if a.s != nil {
code = a.s.Status().Code()
} else {
code = status.Convert(err).Code()
code = status.Code(err)
}
rp := cs.methodConfig.RetryPolicy
@ -651,19 +660,24 @@ func (cs *clientStream) shouldRetry(err error) (bool, error) {
}
// Returns nil if a retry was performed and succeeded; error otherwise.
func (cs *clientStream) retryLocked(lastErr error) error {
func (cs *clientStream) retryLocked(attempt *csAttempt, lastErr error) error {
for {
cs.attempt.finish(toRPCErr(lastErr))
isTransparent, err := cs.shouldRetry(lastErr)
attempt.finish(toRPCErr(lastErr))
isTransparent, err := attempt.shouldRetry(lastErr)
if err != nil {
cs.commitAttemptLocked()
return err
}
cs.firstAttempt = false
if err := cs.newAttemptLocked(isTransparent); err != nil {
attempt, err = cs.newAttemptLocked(isTransparent)
if err != nil {
// Only returns error if the clientconn is closed or the context of
// the stream is canceled.
return err
}
if lastErr = cs.replayBufferLocked(); lastErr == nil {
// Note that the first op in the replay buffer always sets cs.attempt
// if it is able to pick a transport and create a stream.
if lastErr = cs.replayBufferLocked(attempt); lastErr == nil {
return nil
}
}
@ -673,7 +687,10 @@ func (cs *clientStream) Context() context.Context {
cs.commitAttempt()
// No need to lock before using attempt, since we know it is committed and
// cannot change.
return cs.attempt.s.Context()
if cs.attempt.s != nil {
return cs.attempt.s.Context()
}
return cs.ctx
}
func (cs *clientStream) withRetry(op func(a *csAttempt) error, onSuccess func()) error {
@ -703,7 +720,7 @@ func (cs *clientStream) withRetry(op func(a *csAttempt) error, onSuccess func())
cs.mu.Unlock()
return err
}
if err := cs.retryLocked(err); err != nil {
if err := cs.retryLocked(a, err); err != nil {
cs.mu.Unlock()
return err
}
@ -734,7 +751,7 @@ func (cs *clientStream) Header() (metadata.MD, error) {
cs.binlog.Log(logEntry)
cs.serverHeaderBinlogged = true
}
return m, err
return m, nil
}
func (cs *clientStream) Trailer() metadata.MD {
@ -752,10 +769,9 @@ func (cs *clientStream) Trailer() metadata.MD {
return cs.attempt.s.Trailer()
}
func (cs *clientStream) replayBufferLocked() error {
a := cs.attempt
func (cs *clientStream) replayBufferLocked(attempt *csAttempt) error {
for _, f := range cs.buffer {
if err := f(a); err != nil {
if err := f(attempt); err != nil {
return err
}
}
@ -803,22 +819,17 @@ func (cs *clientStream) SendMsg(m interface{}) (err error) {
if len(payload) > *cs.callInfo.maxSendMessageSize {
return status.Errorf(codes.ResourceExhausted, "trying to send message larger than max (%d vs. %d)", len(payload), *cs.callInfo.maxSendMessageSize)
}
msgBytes := data // Store the pointer before setting to nil. For binary logging.
op := func(a *csAttempt) error {
err := a.sendMsg(m, hdr, payload, data)
// nil out the message and uncomp when replaying; they are only needed for
// stats which is disabled for subsequent attempts.
m, data = nil, nil
return err
return a.sendMsg(m, hdr, payload, data)
}
err = cs.withRetry(op, func() { cs.bufferForRetryLocked(len(hdr)+len(payload), op) })
if cs.binlog != nil && err == nil {
cs.binlog.Log(&binarylog.ClientMessage{
OnClientSide: true,
Message: msgBytes,
Message: data,
})
}
return
return err
}
func (cs *clientStream) RecvMsg(m interface{}) error {

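A toy model of the buffer-and-replay flow that the stream.go changes above restructure; the types are stand-ins, not grpc-go's. The point it illustrates: replayBufferLocked now receives the new attempt explicitly, and the first buffered op is the one that acquires a transport and stream and only then publishes cs.attempt, so a half-built attempt is never visible to other stream methods.

package main

import "fmt"

type csAttempt struct {
	name      string
	hasStream bool
}

type clientStream struct {
	attempt *csAttempt
	buffer  []func(*csAttempt) error
}

// replayBufferLocked mirrors the new signature in the diff: the caller passes
// the fresh attempt in, rather than the method reading cs.attempt.
func (cs *clientStream) replayBufferLocked(a *csAttempt) error {
	for _, f := range cs.buffer {
		if err := f(a); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	cs := &clientStream{}
	// First buffered op: stands in for getTransport()+newStream(); it publishes
	// the attempt only after both succeed.
	cs.buffer = append(cs.buffer, func(a *csAttempt) error {
		a.hasStream = true
		cs.attempt = a
		return nil
	})
	// Later buffered ops (e.g. SendMsg) simply run against the attempt they get.
	cs.buffer = append(cs.buffer, func(a *csAttempt) error {
		fmt.Printf("%s: sending (hasStream=%v)\n", a.name, a.hasStream)
		return nil
	})

	// A retry replays the whole buffer against a brand-new attempt.
	if err := cs.replayBufferLocked(&csAttempt{name: "attempt-2"}); err != nil {
		fmt.Println("replay failed:", err)
	}
	fmt.Println("published attempt:", cs.attempt.name)
}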

@ -1508,7 +1508,7 @@ func testFailFast(t *testing.T, e env) {
cc := te.clientConn()
tc := testpb.NewTestServiceClient(cc)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()
if _, err := tc.EmptyCall(ctx, &testpb.Empty{}); err != nil {
t.Fatalf("TestService/EmptyCall(_, _) = _, %v, want _, <nil>", err)
@ -1517,9 +1517,10 @@ func testFailFast(t *testing.T, e env) {
te.srv.Stop()
// Loop until the server teardown is propagated to the client.
for {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
if err := ctx.Err(); err != nil {
t.Fatalf("EmptyCall did not return UNAVAILABLE before timeout")
}
_, err := tc.EmptyCall(ctx, &testpb.Empty{})
cancel()
if status.Code(err) == codes.Unavailable {
break
}


@ -34,6 +34,7 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/internal/grpcsync"
"google.golang.org/grpc/internal/stubserver"
"google.golang.org/grpc/metadata"
"google.golang.org/grpc/stats"
@ -44,7 +45,8 @@ import (
func (s) TestRetryUnary(t *testing.T) {
i := -1
ss := &stubserver.StubServer{
EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
EmptyCallF: func(context.Context, *testpb.Empty) (r *testpb.Empty, err error) {
defer func() { t.Logf("server call %v returning err %v", i, err) }()
i++
switch i {
case 0, 2, 5:
@ -55,11 +57,8 @@ func (s) TestRetryUnary(t *testing.T) {
return nil, status.New(codes.AlreadyExists, "retryable error").Err()
},
}
if err := ss.Start([]grpc.ServerOption{}); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
defer ss.Stop()
ss.NewServiceConfig(`{
if err := ss.Start([]grpc.ServerOption{},
grpc.WithDefaultServiceConfig(`{
"methodConfig": [{
"name": [{"service": "grpc.testing.TestService"}],
"waitForReady": true,
@ -70,18 +69,10 @@ func (s) TestRetryUnary(t *testing.T) {
"BackoffMultiplier": 1.0,
"RetryableStatusCodes": [ "ALREADY_EXISTS" ]
}
}]}`)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
for {
if ctx.Err() != nil {
t.Fatalf("Timed out waiting for service config update")
}
if ss.CC.GetMethodConfig("/grpc.testing.TestService/EmptyCall").WaitForReady != nil {
break
}
time.Sleep(time.Millisecond)
}]}`)); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
cancel()
defer ss.Stop()
testCases := []struct {
code codes.Code
@ -95,7 +86,8 @@ func (s) TestRetryUnary(t *testing.T) {
{codes.Internal, 11},
{codes.AlreadyExists, 15},
}
for _, tc := range testCases {
for num, tc := range testCases {
t.Log("Case", num)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
_, err := ss.Client.EmptyCall(ctx, &testpb.Empty{})
cancel()
@ -120,11 +112,8 @@ func (s) TestRetryThrottling(t *testing.T) {
return nil, status.New(codes.Unavailable, "retryable error").Err()
},
}
if err := ss.Start([]grpc.ServerOption{}); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
defer ss.Stop()
ss.NewServiceConfig(`{
if err := ss.Start([]grpc.ServerOption{},
grpc.WithDefaultServiceConfig(`{
"methodConfig": [{
"name": [{"service": "grpc.testing.TestService"}],
"waitForReady": true,
@ -140,18 +129,10 @@ func (s) TestRetryThrottling(t *testing.T) {
"maxTokens": 10,
"tokenRatio": 0.5
}
}`)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
for {
if ctx.Err() != nil {
t.Fatalf("Timed out waiting for service config update")
}
if ss.CC.GetMethodConfig("/grpc.testing.TestService/EmptyCall").WaitForReady != nil {
break
}
time.Sleep(time.Millisecond)
}`)); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
cancel()
defer ss.Stop()
testCases := []struct {
code codes.Code
@ -430,11 +411,8 @@ func (s) TestRetryStreaming(t *testing.T) {
return nil
},
}
if err := ss.Start([]grpc.ServerOption{}, grpc.WithDefaultCallOptions(grpc.MaxRetryRPCBufferSize(200))); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
defer ss.Stop()
ss.NewServiceConfig(`{
if err := ss.Start([]grpc.ServerOption{}, grpc.WithDefaultCallOptions(grpc.MaxRetryRPCBufferSize(200)),
grpc.WithDefaultServiceConfig(`{
"methodConfig": [{
"name": [{"service": "grpc.testing.TestService"}],
"waitForReady": true,
@ -445,7 +423,10 @@ func (s) TestRetryStreaming(t *testing.T) {
"BackoffMultiplier": 1.0,
"RetryableStatusCodes": [ "UNAVAILABLE" ]
}
}]}`)
}]}`)); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
defer ss.Stop()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
for {
if ctx.Err() != nil {
@ -644,3 +625,85 @@ func (s) TestRetryStats(t *testing.T) {
t.Fatalf("pushback time before final attempt = %v; want ~10ms", diff)
}
}
func (s) TestRetryTransparentWhenCommitted(t *testing.T) {
// With MaxConcurrentStreams=1:
//
// 1. Create stream 1 that is retriable.
// 2. Stream 1 is created and fails with a retriable code.
// 3. Create dummy stream 2, blocking indefinitely.
// 4. Stream 1 retries (and blocks until stream 2 finishes)
// 5. Stream 1 is canceled manually.
//
// If there is no bug, the stream is done and errors with CANCELED. With a bug:
//
// 6. Stream 1 has a nil stream (attempt.s). Operations like CloseSend will panic.
first := grpcsync.NewEvent()
ss := &stubserver.StubServer{
FullDuplexCallF: func(stream testpb.TestService_FullDuplexCallServer) error {
// signal?
if !first.HasFired() {
first.Fire()
t.Log("returned first error")
return status.Error(codes.AlreadyExists, "first attempt fails and is retriable")
}
t.Log("blocking")
<-stream.Context().Done()
return stream.Context().Err()
},
}
if err := ss.Start([]grpc.ServerOption{grpc.MaxConcurrentStreams(1)},
grpc.WithDefaultServiceConfig(`{
"methodConfig": [{
"name": [{"service": "grpc.testing.TestService"}],
"waitForReady": true,
"retryPolicy": {
"MaxAttempts": 2,
"InitialBackoff": ".1s",
"MaxBackoff": ".1s",
"BackoffMultiplier": 1.0,
"RetryableStatusCodes": [ "ALREADY_EXISTS" ]
}
}]}`)); err != nil {
t.Fatalf("Error starting endpoint server: %v", err)
}
defer ss.Stop()
ctx1, cancel1 := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel1()
ctx2, cancel2 := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel2()
stream1, err := ss.Client.FullDuplexCall(ctx1)
if err != nil {
t.Fatalf("Error creating stream 1: %v", err)
}
// Create dummy stream to block indefinitely.
_, err = ss.Client.FullDuplexCall(ctx2)
if err != nil {
t.Errorf("Error creating stream 2: %v", err)
}
stream1Closed := grpcsync.NewEvent()
go func() {
_, err := stream1.Recv()
// Will trigger a retry when it sees the ALREADY_EXISTS error
if status.Code(err) != codes.Canceled {
t.Errorf("Expected stream1 to be canceled; got error: %v", err)
}
stream1Closed.Fire()
}()
// Wait longer than the retry backoff timer.
time.Sleep(200 * time.Millisecond)
cancel1()
// Operations on the stream should not panic.
<-stream1Closed.Done()
stream1.CloseSend()
stream1.Recv()
stream1.Send(&testpb.StreamingOutputCallRequest{})
}