Simplify how gRPC services start, stop, and clean up (#6771)

The CA, RA, and VA have multiple goroutines running alongside the primary
gRPC handling goroutine. These ancillary goroutines should be gracefully
shut down when the process is about to exit. Historically, we have
handled this by putting a call to each of these goroutines' shutdown
functions inside cmd.CatchSignals, so that when a SIGINT is received, all
of the various cleanup routines run in sequence.

But there's a cleaner way to do it: just use defer! All of these
cleanups need to happen after the primary gRPC server has fully shut
down, so that we know they stick around at least as long as the service
is handling gRPC requests. And when the service receives a SIGINT (or
SIGTERM or SIGHUP), the signal-handling goroutine spawned by the gRPC
server's Build function will call the server's GracefulStop, which will
cause the server's .Serve() to finally exit, which will cause start() to
exit, which will cause main() to exit, which will cause all deferred
functions to be run.
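
As a minimal, self-contained sketch of that flow (plain grpc-go rather than
Boulder's bgrpc wrapper; a ticker stands in for the ancillary goroutines, and
the signal handler lives in main() here for brevity rather than inside Build):

package main

import (
    "log"
    "net"
    "os"
    "os/signal"
    "syscall"
    "time"

    "google.golang.org/grpc"
)

func main() {
    // Stand-in for an ancillary goroutine's resources. Its cleanup is deferred,
    // so it runs only after main() unwinds, i.e. after the gRPC server has
    // fully drained and Serve() has returned.
    ticker := time.NewTicker(time.Second)
    defer ticker.Stop()

    lis, err := net.Listen("tcp", "localhost:0")
    if err != nil {
        log.Fatal(err)
    }
    srv := grpc.NewServer()

    // On SIGINT/SIGTERM, gracefully stop the server: in-flight RPCs finish,
    // then srv.Serve(lis) below returns.
    go func() {
        sigs := make(chan os.Signal, 1)
        signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
        <-sigs
        srv.GracefulStop()
    }()

    // Serve blocks until the graceful stop; any real error is simply reported,
    // and the deferred cleanup above runs as main() returns.
    if err := srv.Serve(lis); err != nil {
        log.Print(err)
    }
}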

In addition, remove filterShutdownErrors, as the bug which made it
necessary (.Serve() returning an error even when GracefulStop() is
called) was fixed back in 2017. This allows us to call the start()
function in a much more natural way, simply logging any error it returns
instead of calling os.Exit(1).
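
For instance (a standalone check, not Boulder code), a graceful stop no
longer surfaces as an error from Serve():

package main

import (
    "fmt"
    "log"
    "net"
    "time"

    "google.golang.org/grpc"
)

func main() {
    lis, err := net.Listen("tcp", "localhost:0")
    if err != nil {
        log.Fatal(err)
    }
    srv := grpc.NewServer()

    // Gracefully stop the server shortly after it starts serving.
    go func() {
        time.Sleep(100 * time.Millisecond)
        srv.GracefulStop()
    }()

    // Since the 2017 fix, Serve returns nil after GracefulStop rather than a
    // "use of closed network connection" error, so no filtering is needed.
    fmt.Println("Serve returned:", srv.Serve(lis))
}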

This simplifies the exit-handling code in these three services' main()
functions, and lets us be a bit more idiomatic with our deferred cleanup
functions.

Part of #6794
Aaron Gable 2023-04-05 14:55:57 -07:00 committed by GitHub
parent 0b4d2f4a66
commit d6cd589795
10 changed files with 57 additions and 78 deletions


@@ -400,13 +400,9 @@ func daemon(c Config, ap *akamaiPurger, logger blog.Logger, scope prometheus.Reg
stopped <- true
}()
start, stopFn, err := bgrpc.NewServer(c.AkamaiPurger.GRPC).Add(
&akamaipb.AkamaiPurger_ServiceDesc, ap).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup Akamai purger gRPC server")
go cmd.CatchSignals(logger, func() {
stopFn()
// When the gRPC server finally exits, run a clean-up routine that stops the
// ticker and waits for the goroutine above to finish purging the stack.
defer func() {
// Stop the ticker and signal that we want to shutdown by writing to the
// stop channel. We wait 15 seconds for any remaining URLs to be emptied
// from the current stack, if we pass that deadline we exit early.
@@ -417,14 +413,13 @@ func daemon(c Config, ap *akamaiPurger, logger blog.Logger, scope prometheus.Reg
cmd.Fail("Timed out waiting for purger to finish work")
case <-stopped:
}
})
cmd.FailOnError(start(), "akamai-purger gRPC service failed")
}()
// When we get a SIGTERM, we will exit from grpcSrv.Serve as soon as all
// extant RPCs have been processed, but we want the process to stick around
// while we still have a goroutine purging the last elements from the stack.
// Once that's done, CatchSignals will call os.Exit().
select {}
start, err := bgrpc.NewServer(c.AkamaiPurger.GRPC).Add(
&akamaipb.AkamaiPurger_ServiceDesc, ap).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup Akamai purger gRPC server")
cmd.FailOnError(start(), "akamai-purger gRPC service failed")
}
func init() {


@@ -249,8 +249,8 @@ func main() {
var entries int
ecdsaAllowList, entries, err = ca.NewECDSAAllowListFromFile(c.CA.ECDSAAllowListFilename, logger, allowListGauge)
cmd.FailOnError(err, "Unable to load ECDSA allow list from YAML file")
defer ecdsaAllowList.Stop()
logger.Infof("Created a reloadable allow list, it was initialized with %d entries", entries)
}
srv := bgrpc.NewServer(c.CA.GRPCCA)
@@ -272,6 +272,7 @@ func main() {
)
cmd.FailOnError(err, "Failed to create OCSP impl")
go ocspi.LogOCSPLoop()
defer ocspi.Stop()
srv = srv.Add(&capb.OCSPGenerator_ServiceDesc, ocspi)
}
@@ -316,16 +317,9 @@ func main() {
srv = srv.Add(&capb.CertificateAuthority_ServiceDesc, cai)
}
start, stop, err := srv.Build(tlsConfig, scope, clk)
start, err := srv.Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup CA gRPC server")
go cmd.CatchSignals(logger, func() {
stop()
ecdsaAllowList.Stop()
if ocspi != nil {
ocspi.Stop()
}
})
cmd.FailOnError(start(), "CA gRPC service failed")
}


@@ -92,11 +92,10 @@ func main() {
pubi := publisher.New(bundles, c.Publisher.UserAgent, logger, scope)
start, stop, err := bgrpc.NewServer(c.Publisher.GRPC).Add(
start, err := bgrpc.NewServer(c.Publisher.GRPC).Add(
&pubpb.Publisher_ServiceDesc, pubi).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup Publisher gRPC server")
go cmd.CatchSignals(logger, stop)
cmd.FailOnError(start(), "Publisher gRPC service failed")
}


@@ -243,6 +243,7 @@ func main() {
apc,
issuerCerts,
)
defer rai.DrainFinalize()
policyErr := rai.SetRateLimitPoliciesFile(c.RA.RateLimitPoliciesFilename)
cmd.FailOnError(policyErr, "Couldn't load rate limit policies file")
@@ -253,14 +254,10 @@ func main() {
rai.OCSP = ocspc
rai.SA = sac
start, stop, err := bgrpc.NewServer(c.RA.GRPC).Add(
start, err := bgrpc.NewServer(c.RA.GRPC).Add(
&rapb.RegistrationAuthority_ServiceDesc, rai).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup RA gRPC server")
go cmd.CatchSignals(logger, func() {
stop()
rai.DrainFinalize()
})
cmd.FailOnError(start(), "RA gRPC service failed")
}


@@ -93,13 +93,12 @@ func main() {
sai, err := sa.NewSQLStorageAuthorityWrapping(saroi, dbMap, scope)
cmd.FailOnError(err, "Failed to create SA impl")
start, stop, err := bgrpc.NewServer(c.SA.GRPC).Add(
start, err := bgrpc.NewServer(c.SA.GRPC).Add(
&sapb.StorageAuthorityReadOnly_ServiceDesc, saroi).Add(
&sapb.StorageAuthority_ServiceDesc, sai).Build(
tls, scope, clk)
cmd.FailOnError(err, "Unable to setup SA gRPC server")
go cmd.CatchSignals(logger, stop)
cmd.FailOnError(start(), "SA gRPC service failed")
}


@@ -96,6 +96,7 @@ func main() {
}
servers, err = bdns.StartDynamicProvider(c.VA.DNSResolver, 60*time.Second)
cmd.FailOnError(err, "Couldn't start dynamic DNS server resolver")
defer servers.Stop()
var resolver bdns.Client
if !(c.VA.DNSAllowLoopbackAddresses || c.Common.DNSAllowLoopbackAddresses) {
@@ -147,16 +148,11 @@ func main() {
c.VA.AccountURIPrefixes)
cmd.FailOnError(err, "Unable to create VA server")
start, stop, err := bgrpc.NewServer(c.VA.GRPC).Add(
start, err := bgrpc.NewServer(c.VA.GRPC).Add(
&vapb.VA_ServiceDesc, vai).Add(
&vapb.CAA_ServiceDesc, vai).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup VA gRPC server")
go cmd.CatchSignals(logger, func() {
servers.Stop()
stop()
})
cmd.FailOnError(start(), "VA gRPC service failed")
}


@@ -123,11 +123,10 @@ func main() {
csi, err := storer.New(issuers, s3client, c.CRLStorer.S3Bucket, scope, logger, clk)
cmd.FailOnError(err, "Failed to create CRLStorer impl")
start, stop, err := bgrpc.NewServer(c.CRLStorer.GRPC).Add(
start, err := bgrpc.NewServer(c.CRLStorer.GRPC).Add(
&cspb.CRLStorer_ServiceDesc, csi).Build(tlsConfig, scope, clk)
cmd.FailOnError(err, "Unable to setup CRLStorer gRPC server")
go cmd.CatchSignals(logger, stop)
cmd.FailOnError(start(), "CRLStorer gRPC service failed")
}


@@ -109,11 +109,10 @@ func main() {
cmd.FailOnError(err, "tlsConfig config")
nonceServer := nonce.NewServer(ns)
start, stop, err := bgrpc.NewServer(c.NonceService.GRPC).Add(
start, err := bgrpc.NewServer(c.NonceService.GRPC).Add(
&noncepb.NonceService_ServiceDesc, nonceServer).Build(tlsConfig, scope, cmd.Clock())
cmd.FailOnError(err, "Unable to setup nonce service gRPC server")
go cmd.CatchSignals(logger, stop)
cmd.FailOnError(start(), "Nonce service gRPC server failed")
}


@@ -157,7 +157,7 @@ func (lw logWriter) Write(p []byte) (n int, err error) {
// StatsAndLogging constructs a prometheus registerer and an AuditLogger based
// on its config parameters, and return them both. It also spawns off an HTTP
// server on the provided port to report the stats and provide pprof profiling
// handlers. NewLogger and newStatsRegistry will call os.Exit on errors.
// handlers. NewLogger and newStatsRegistry will panic on errors.
// Also sets the constructed AuditLogger as the default logger, and configures
// the mysql and grpc packages to use our logger.
// This must be called before any gRPC code is called, because gRPC's SetLogger
@@ -276,15 +276,18 @@ func newStatsRegistry(addr string, logger blog.Logger) prometheus.Registerer {
return registry
}
// Fail exits and prints an error message to stderr and the logger audit log.
// Fail prints a message to the audit log, then panics, causing the process to exit but
// allowing deferred cleanup functions to run on the way out.
func Fail(msg string) {
logger := blog.Get()
logger.AuditErr(msg)
os.Exit(1)
panic(msg)
}
// FailOnError exits and prints an error message, but only if we encountered
// a problem and err != nil. err is required but msg can be "".
// FailOnError prints an error message and panics, but only if the provided
// error is actually non-nil. This is useful for one-line error handling in
// top-level executables, but should generally be avoided in libraries. The
// message argument is optional.
func FailOnError(err error, msg string) {
if err == nil {
return

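The reason Fail and FailOnError now panic instead of calling os.Exit(1) is
that a panic unwinds the stack and runs deferred cleanup on the way out,
whereas os.Exit skips it. A tiny standalone illustration (not Boulder code):

package main

import "fmt"

func main() {
    defer fmt.Println("deferred cleanup runs") // printed before the process dies
    panic("fatal error")                       // os.Exit(1) here would skip the defer
}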

@@ -5,7 +5,10 @@ import (
"errors"
"fmt"
"net"
"os"
"os/signal"
"strings"
"syscall"
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
"github.com/jmhodges/clock"
@@ -64,16 +67,18 @@ func (sb *serverBuilder) Add(desc *grpc.ServiceDesc, impl any) *serverBuilder {
// Build creates a gRPC server that uses the provided *tls.Config and exposes
// all of the services added to the builder. It also exposes a health check
// service. It returns two functions, start() and stop(), which should be used
// to start and gracefully stop the server.
func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.Registerer, clk clock.Clock) (func() error, func(), error) {
// service. It returns one function, start(), which should be used to start
// the server. It spawns a goroutine which will listen for OS signals and
// gracefully stop the server if one is caught, causing the start() function to
// exit.
func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.Registerer, clk clock.Clock) (func() error, error) {
// Add the health service to all servers.
healthSrv := health.NewServer()
sb = sb.Add(&healthpb.Health_ServiceDesc, healthSrv)
// Check to see if any of the calls to .Add() resulted in an error.
if sb.err != nil {
return nil, nil, sb.err
return nil, sb.err
}
// Ensure that every configured service also got added.
@@ -84,12 +89,12 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
for serviceName := range sb.cfg.Services {
_, ok := sb.services[serviceName]
if !ok {
return nil, nil, fmt.Errorf("gRPC service %q in config does not match any service: %s", serviceName, strings.Join(registeredServices, ", "))
return nil, fmt.Errorf("gRPC service %q in config does not match any service: %s", serviceName, strings.Join(registeredServices, ", "))
}
}
if tlsConfig == nil {
return nil, nil, errNilTLS
return nil, errNilTLS
}
// Collect all names which should be allowed to connect to the server at all.
@@ -108,14 +113,14 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
creds, err := bcreds.NewServerCredentials(tlsConfig, acceptedSANs)
if err != nil {
return nil, nil, err
return nil, err
}
// Set up all of our interceptors which handle metrics, traces, error
// propagation, and more.
metrics, err := newServerMetrics(statsRegistry)
if err != nil {
return nil, nil, err
return nil, err
}
var ai serverInterceptor
@@ -160,18 +165,27 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
// Finally return the functions which will start and stop the server.
listener, err := net.Listen("tcp", sb.cfg.Address)
if err != nil {
return nil, nil, err
return nil, err
}
start := func() error {
return filterShutdownErrors(server.Serve(listener))
}
stop := func() {
healthSrv.Shutdown()
server.GracefulStop()
return server.Serve(listener)
}
return start, stop, nil
// Start a goroutine which listens for a termination signal, and then
// gracefully stops the gRPC server. This in turn causes the start() function
// to exit, allowing its caller (generally a main() function) to exit.
go func() {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGTERM)
signal.Notify(sigChan, syscall.SIGINT)
signal.Notify(sigChan, syscall.SIGHUP)
<-sigChan
healthSrv.Shutdown()
server.GracefulStop()
}()
return start, nil
}
// serverMetrics is a struct type used to return a few registered metrics from
@@ -222,19 +236,3 @@ func newServerMetrics(stats prometheus.Registerer) (serverMetrics, error) {
rpcLag: rpcLag,
}, nil
}
// filterShutdownErrors returns the input error, with the exception of "use of
// closed network connection," on which it returns nil
// Per https://github.com/grpc/grpc-go/issues/1017, a gRPC server's `Serve()`
// will always return an error, even when GracefulStop() is called. We don't
// want to log graceful stops as errors, so we filter out the meaningless
// error we get in that situation.
func filterShutdownErrors(err error) error {
if err == nil {
return nil
}
if strings.Contains(err.Error(), "use of closed network connection") {
return nil
}
return err
}