Disable fail-fast for gRPC. (#2397)
This allows us to restart backends with relatively little interruption in service, provided the backends come up promptly. Fixes #2389 and #2408.
This commit is contained in:
parent 3cff6babb3
commit 5b865f1d63
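In practice, a unary RPC issued through a non-FailFast client blocks until a backend becomes ready or the caller's deadline expires, rather than failing immediately with a connection error. Below is a minimal sketch of that behavior, not part of this commit, using the Chiller test service introduced later in this diff; the address and timeout are arbitrary:

package main

import (
	"fmt"
	"time"

	"golang.org/x/net/context"
	"google.golang.org/grpc"

	"github.com/letsencrypt/boulder/grpc/test_proto"
)

func main() {
	// Dialing is lazy: this succeeds even if nothing is listening yet.
	conn, err := grpc.Dial("localhost:9090", grpc.WithInsecure())
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	c := test_proto.NewChillerClient(conn)

	// Bound the wait with a deadline. If a backend comes up within five
	// seconds the RPC is serviced; otherwise it fails with DeadlineExceeded
	// instead of an immediate "connection refused".
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	dur := time.Millisecond.Nanoseconds()
	resp, err := c.Chill(ctx, &test_proto.Time{Time: &dur}, grpc.FailFast(false))
	fmt.Println(resp, err)
}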
@@ -13,6 +13,8 @@ import (
 	"google.golang.org/grpc"
 )
 
+// serverInterceptor is a gRPC interceptor that adds statsd and Prometheus
+// metrics to requests handled by a gRPC server.
 type serverInterceptor struct {
 	stats metrics.Scope
 	clk   clock.Clock
@@ -48,6 +50,13 @@ func (si *serverInterceptor) intercept(ctx context.Context, req interface{}, inf
 	return resp, err
 }
 
+// clientInterceptor is a gRPC interceptor that adds statsd and Prometheus
+// metrics to sent requests, and disables FailFast. We disable FailFast because
+// non-FailFast mode is most similar to the old AMQP RPC layer: If a client
+// makes a request while all backends are briefly down (e.g. for a restart), the
+// request doesn't necessarily fail. A backend can service the request if it
+// comes back up within the timeout. Under gRPC the same effect is achieved by
+// retries up to the Context deadline.
 type clientInterceptor struct {
 	stats metrics.Scope
 	clk   clock.Clock
@@ -70,6 +79,9 @@ func (ci *clientInterceptor) intercept(
 	methodScope := ci.stats.NewScope(cleanMethod(method, false))
 	methodScope.Inc("Calls", 1)
 	methodScope.GaugeDelta("InProgress", 1)
+	// Disable fail-fast so RPCs will retry until deadline, even if all backends
+	// are down.
+	opts = append(opts, grpc.FailFast(false))
 	err := grpc_prometheus.UnaryClientInterceptor(localCtx, method, req, reply, cc, invoker, opts...)
 	methodScope.TimingDuration("Latency", ci.clk.Since(s))
 	methodScope.GaugeDelta("InProgress", -1)
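The same effect can be had without the metrics plumbing. A stripped-down sketch of the pattern this hunk applies, as a standalone interceptor (the name failFastOff is illustrative, not Boulder's):

package grpcexample

import (
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// failFastOff forces wait-for-ready semantics on every call by appending
// grpc.FailFast(false) to the call options before invoking. Appending it
// last means it overrides any FailFast option the caller passed earlier.
func failFastOff(ctx context.Context, method string, req, reply interface{},
	cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
	opts = append(opts, grpc.FailFast(false))
	return invoker(ctx, method, req, reply, cc, opts...)
}

It would be installed at dial time, e.g. grpc.Dial(addr, grpc.WithUnaryInterceptor(failFastOff)).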
@@ -10,6 +10,7 @@ import (
 	"golang.org/x/net/context"
 	"google.golang.org/grpc"
 
+	"github.com/letsencrypt/boulder/grpc/test_proto"
 	"github.com/letsencrypt/boulder/metrics"
 	"github.com/letsencrypt/boulder/test"
 )
@@ -24,7 +25,7 @@ func testHandler(_ context.Context, i interface{}) (interface{}, error) {
 	return nil, nil
 }
 
-func testInvoker(_ context.Context, method string, _, _ interface{}, _ *grpc.ClientConn, _ ...grpc.CallOption) error {
+func testInvoker(_ context.Context, method string, _, _ interface{}, _ *grpc.ClientConn, opts ...grpc.CallOption) error {
 	if method == "-service-brokeTest" {
 		return errors.New("")
 	}
@@ -82,6 +83,44 @@ func TestClientInterceptor(t *testing.T) {
 	test.AssertError(t, err, "ci.intercept didn't fail when handler returned a error")
 }
 
+// testServer is used to implement InterceptorTest
+type testServer struct{}
+
+// Chill implements InterceptorTest.Chill
+func (s *testServer) Chill(ctx context.Context, in *test_proto.Time) (*test_proto.Time, error) {
+	start := time.Now()
+	time.Sleep(time.Duration(*in.Time) * time.Nanosecond)
+	spent := int64(time.Since(start) / time.Nanosecond)
+	return &test_proto.Time{Time: &spent}, nil
+}
+
+// TestFailFastFalse sends a gRPC request to a backend that is
+// unavailable, and ensures that the request doesn't error out until the
+// timeout is reached, i.e. that FailFast is set to false.
+// https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md
+func TestFailFastFalse(t *testing.T) {
+	ci := &clientInterceptor{metrics.NewNoopScope(), clock.Default(), 100 * time.Millisecond}
+	conn, err := grpc.Dial("localhost:19876", // random, probably unused port
+		grpc.WithInsecure(),
+		grpc.WithBalancer(grpc.RoundRobin(newStaticResolver([]string{"localhost:19000"}))),
+		grpc.WithUnaryInterceptor(ci.intercept))
+	if err != nil {
+		t.Fatalf("did not connect: %v", err)
+	}
+	c := test_proto.NewChillerClient(conn)
+
+	start := time.Now()
+	var second int64 = time.Second.Nanoseconds()
+	_, err = c.Chill(context.Background(), &test_proto.Time{Time: &second})
+	if err == nil {
+		t.Errorf("Successful Chill when we expected failure.")
+	}
+	if time.Since(start) < 90*time.Millisecond {
+		t.Errorf("Chill failed fast, when FailFast should be disabled.")
+	}
+	_ = conn.Close()
+}
+
 func TestCleanMethod(t *testing.T) {
 	tests := []struct {
 		in string
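TestFailFastFalse deliberately leaves the backend down. Sketched below, under the assumption that it would run as its own process, is the missing half: a Chiller backend on one of the addresses the test's static resolver points at. Had it come up while the Chill call was pending, the non-FailFast RPC would have completed instead of timing out:

package main

import (
	"net"

	"golang.org/x/net/context"
	"google.golang.org/grpc"

	"github.com/letsencrypt/boulder/grpc/test_proto"
)

// chiller implements the generated ChillerServer interface; unlike the
// testServer above it echoes the requested time back without sleeping.
type chiller struct{}

func (c *chiller) Chill(ctx context.Context, in *test_proto.Time) (*test_proto.Time, error) {
	return &test_proto.Time{Time: in.Time}, nil
}

func main() {
	// Matches one address from the test's static resolver.
	lis, err := net.Listen("tcp", "localhost:19000")
	if err != nil {
		panic(err)
	}
	s := grpc.NewServer()
	test_proto.RegisterChillerServer(s, &chiller{})
	_ = s.Serve(lis)
}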
@@ -0,0 +1,3 @@
+package test_proto
+
+//go:generate sh -c "cd ../.. && protoc --go_out=plugins=grpc,Mcore/proto/core.proto=github.com/letsencrypt/boulder/grpc/test_proto:. grpc/test_proto/interceptors_test.proto"
@@ -0,0 +1,143 @@
+// Code generated by protoc-gen-go.
+// source: grpc/test_proto/interceptors_test.proto
+// DO NOT EDIT!
+
+/*
+Package test_proto is a generated protocol buffer package.
+
+It is generated from these files:
+	grpc/test_proto/interceptors_test.proto
+
+It has these top-level messages:
+	Time
+*/
+package test_proto
+
+import proto "github.com/golang/protobuf/proto"
+import fmt "fmt"
+import math "math"
+
+import (
+	context "golang.org/x/net/context"
+	grpc "google.golang.org/grpc"
+)
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ = proto.Marshal
+var _ = fmt.Errorf
+var _ = math.Inf
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+// A compilation error at this line likely means your copy of the
+// proto package needs to be updated.
+const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package
+
+type Time struct {
+	Time             *int64 `protobuf:"varint,1,opt,name=time" json:"time,omitempty"`
+	XXX_unrecognized []byte `json:"-"`
+}
+
+func (m *Time) Reset()                    { *m = Time{} }
+func (m *Time) String() string            { return proto.CompactTextString(m) }
+func (*Time) ProtoMessage()               {}
+func (*Time) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
+
+func (m *Time) GetTime() int64 {
+	if m != nil && m.Time != nil {
+		return *m.Time
+	}
+	return 0
+}
+
+func init() {
+	proto.RegisterType((*Time)(nil), "Time")
+}
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ context.Context
+var _ grpc.ClientConn
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the grpc package it is being compiled against.
+const _ = grpc.SupportPackageIsVersion3
+
+// Client API for Chiller service
+
+type ChillerClient interface {
+	// Sleep for the given amount of time, and return the amount of time slept.
+	Chill(ctx context.Context, in *Time, opts ...grpc.CallOption) (*Time, error)
+}
+
+type chillerClient struct {
+	cc *grpc.ClientConn
+}
+
+func NewChillerClient(cc *grpc.ClientConn) ChillerClient {
+	return &chillerClient{cc}
+}
+
+func (c *chillerClient) Chill(ctx context.Context, in *Time, opts ...grpc.CallOption) (*Time, error) {
+	out := new(Time)
+	err := grpc.Invoke(ctx, "/Chiller/Chill", in, out, c.cc, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// Server API for Chiller service
+
+type ChillerServer interface {
+	// Sleep for the given amount of time, and return the amount of time slept.
+	Chill(context.Context, *Time) (*Time, error)
+}
+
+func RegisterChillerServer(s *grpc.Server, srv ChillerServer) {
+	s.RegisterService(&_Chiller_serviceDesc, srv)
+}
+
+func _Chiller_Chill_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(Time)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(ChillerServer).Chill(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/Chiller/Chill",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(ChillerServer).Chill(ctx, req.(*Time))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+var _Chiller_serviceDesc = grpc.ServiceDesc{
+	ServiceName: "Chiller",
+	HandlerType: (*ChillerServer)(nil),
+	Methods: []grpc.MethodDesc{
+		{
+			MethodName: "Chill",
+			Handler:    _Chiller_Chill_Handler,
+		},
+	},
+	Streams:  []grpc.StreamDesc{},
+	Metadata: fileDescriptor0,
+}
+
+func init() { proto.RegisterFile("grpc/test_proto/interceptors_test.proto", fileDescriptor0) }
+
+var fileDescriptor0 = []byte{
+	// 114 bytes of a gzipped FileDescriptorProto
+	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0xe2, 0x52, 0x4f, 0x2f, 0x2a, 0x48,
+	0xd6, 0x2f, 0x49, 0x2d, 0x2e, 0x89, 0x2f, 0x28, 0xca, 0x2f, 0xc9, 0xd7, 0xcf, 0xcc, 0x2b, 0x49,
+	0x2d, 0x4a, 0x4e, 0x2d, 0x28, 0xc9, 0x2f, 0x2a, 0x8e, 0x07, 0x89, 0xeb, 0x81, 0xc5, 0x95, 0x44,
+	0xb8, 0x58, 0x42, 0x32, 0x73, 0x53, 0x85, 0x78, 0xb8, 0x58, 0x4a, 0x32, 0x73, 0x53, 0x25, 0x18,
+	0x15, 0x18, 0x35, 0x98, 0x8d, 0x94, 0xb8, 0xd8, 0x9d, 0x33, 0x32, 0x73, 0x72, 0x52, 0x8b, 0x84,
+	0xc4, 0xb9, 0x58, 0xc1, 0x4c, 0x21, 0x56, 0x3d, 0x90, 0x42, 0x29, 0x08, 0xa5, 0xc4, 0xe0, 0xc4,
+	0x13, 0xc5, 0x85, 0x30, 0x1f, 0x10, 0x00, 0x00, 0xff, 0xff, 0x39, 0x6f, 0xd5, 0x15, 0x71, 0x00,
+	0x00, 0x00,
+}
@@ -0,0 +1,12 @@
+syntax = "proto2";
+
+option go_package = "test_proto";
+
+service Chiller {
+	// Sleep for the given amount of time, and return the amount of time slept.
+	rpc Chill(Time) returns (Time) {}
+}
+
+message Time {
+	optional int64 time = 1; // In nanoseconds
+}
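Because this is proto2, the optional int64 becomes a pointer field (*int64) on the generated Time struct, which is why the Go code above passes &second and &spent. A small sketch of building the message with the proto helper instead of a local variable:

package main

import (
	"fmt"
	"time"

	"github.com/golang/protobuf/proto"

	"github.com/letsencrypt/boulder/grpc/test_proto"
)

func main() {
	// proto.Int64 allocates the pointer for a proto2 optional scalar.
	msg := &test_proto.Time{Time: proto.Int64(time.Second.Nanoseconds())}
	fmt.Println(msg.GetTime()) // GetTime is nil-safe and returns 0 when unset.
}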
@@ -536,6 +536,31 @@ func NewAmqpRPCClient(
 		return nil, err
 	}
 
+	// Ensure that the queue we want to write to exists on the AMQP server. This
+	// avoids some test flakiness depending on the start order of the Boulder
+	// components (https://github.com/letsencrypt/boulder/issues/2294)
+	_, err = rpc.connection.channel.QueueDeclare(
+		rpcConf.Server,
+		AmqpDurable,
+		AmqpDeleteUnused,
+		AmqpExclusive,
+		AmqpNoWait,
+		nil)
+	if err != nil {
+		return nil, fmt.Errorf("declaring queue %s: %s", rpcConf.Server, err)
+	}
+	// Make sure that messages with the appropriate routing key get routed to the
+	// server's queue.
+	err = rpc.connection.channel.QueueBind(
+		rpcConf.Server,
+		rpcConf.Server,
+		AmqpExchange,
+		false,
+		nil)
+	if err != nil {
+		return nil, fmt.Errorf("binding routing key %s", err)
+	}
+
 	go func() {
 		for {
 			select {
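The same declare-then-bind pattern, condensed outside Boulder's wrapper types as a sketch with the streadway/amqp client (the queue, exchange, and URL names here are illustrative, not Boulder's actual configuration). QueueDeclare is idempotent, so whichever component starts first creates the queue and later starters find it already present:

package main

import (
	"log"

	"github.com/streadway/amqp"
)

func main() {
	conn, err := amqp.Dial("amqp://guest:guest@localhost:5672/")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	ch, err := conn.Channel()
	if err != nil {
		log.Fatal(err)
	}
	// durable=true; autoDelete, exclusive, and noWait all false, mirroring the
	// Amqp* constants used in the hunk above.
	if _, err = ch.QueueDeclare("SA.server", true, false, false, false, nil); err != nil {
		log.Fatal(err)
	}
	// Route messages whose routing key is the queue name to the queue.
	if err = ch.QueueBind("SA.server", "SA.server", "boulder", false, nil); err != nil {
		log.Fatal(err)
	}
}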
@@ -46,7 +46,6 @@ def start(race_detection):
     global processes
     forward()
     progs = [
-        'boulder-sa --config %s' % os.path.join(default_config_dir, "sa.json"),
         'boulder-wfe --config %s' % os.path.join(default_config_dir, "wfe.json"),
         'boulder-ra --config %s' % os.path.join(default_config_dir, "ra.json"),
         'boulder-ca --config %s' % os.path.join(default_config_dir, "ca.json"),
@@ -56,7 +55,8 @@ def start(race_detection):
         'ocsp-responder --config %s' % os.path.join(default_config_dir, "ocsp-responder.json"),
         'ct-test-srv',
         'dns-test-srv',
-        'mail-test-srv --closeFirst 5'
+        'mail-test-srv --closeFirst 5',
+        'boulder-sa --config %s' % os.path.join(default_config_dir, "sa.json")
     ]
     if not install(race_detection):
         return False