grpc: Enable client-side health_v1 health checking (#8254)
- Configure all gRPC clients to check the overall serving status of each endpoint via the `grpc_health_v1` service. - Configure all gRPC servers to expose the `grpc_health_v1` service to any client permitted to access one of the server’s services. - Modify long-running, deep health checks to set and transition the overall (empty string) health status of the gRPC server in addition to the specific service they were configured for. Fixes #8227
This commit is contained in:
parent
b6c5ee69ed
commit
1bfc3186c8
|
|
@ -465,7 +465,7 @@ type GRPCServerConfig struct {
|
|||
// These service names must match the service names advertised by gRPC itself,
|
||||
// which are identical to the names set in our gRPC .proto files prefixed by
|
||||
// the package names set in those files (e.g. "ca.CertificateAuthority").
|
||||
Services map[string]GRPCServiceConfig `json:"services" validate:"required,dive,required"`
|
||||
Services map[string]*GRPCServiceConfig `json:"services" validate:"required,dive,required"`
|
||||
// MaxConnectionAge specifies how long a connection may live before the server sends a GoAway to the
|
||||
// client. Because gRPC connections re-resolve DNS after a connection close,
|
||||
// this controls how long it takes before a client learns about changes to its
|
||||
|
|
@ -476,10 +476,10 @@ type GRPCServerConfig struct {
|
|||
|
||||
// GRPCServiceConfig contains the information needed to configure a gRPC service.
|
||||
type GRPCServiceConfig struct {
|
||||
// PerServiceClientNames is a map of gRPC service names to client certificate
|
||||
// SANs. The upstream listening server will reject connections from clients
|
||||
// which do not appear in this list, and the server interceptor will reject
|
||||
// RPC calls for this service from clients which are not listed here.
|
||||
// ClientNames is the list of accepted gRPC client certificate SANs.
|
||||
// Connections from clients not in this list will be rejected by the
|
||||
// upstream listener, and RPCs from unlisted clients will be denied by the
|
||||
// server interceptor.
|
||||
ClientNames []string `json:"clientNames" validate:"min=1,dive,hostname,required"`
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,11 +14,13 @@ import (
|
|||
"github.com/letsencrypt/boulder/cmd"
|
||||
bcreds "github.com/letsencrypt/boulder/grpc/creds"
|
||||
|
||||
// 'grpc/health' is imported for its init function, which causes clients to
|
||||
// rely on the Health Service for load-balancing.
|
||||
// 'grpc/internal/resolver/dns' is imported for its init function, which
|
||||
// registers the SRV resolver.
|
||||
"google.golang.org/grpc/balancer/roundrobin"
|
||||
|
||||
// 'grpc/health' is imported for its init function, which causes clients to
|
||||
// rely on the Health Service for load-balancing as long as a
|
||||
// "healthCheckConfig" is specified in the gRPC service config.
|
||||
_ "google.golang.org/grpc/health"
|
||||
|
||||
_ "github.com/letsencrypt/boulder/grpc/internal/resolver/dns"
|
||||
|
|
@ -61,7 +63,21 @@ func ClientSetup(c *cmd.GRPCClientConfig, tlsConfig *tls.Config, statsRegistry p
|
|||
creds := bcreds.NewClientCredentials(tlsConfig.RootCAs, tlsConfig.Certificates, hostOverride)
|
||||
return grpc.NewClient(
|
||||
target,
|
||||
grpc.WithDefaultServiceConfig(fmt.Sprintf(`{"loadBalancingConfig": [{"%s":{}}]}`, roundrobin.Name)),
|
||||
grpc.WithDefaultServiceConfig(
|
||||
fmt.Sprintf(
|
||||
// By setting the service name to an empty string in
|
||||
// healthCheckConfig, we're instructing the gRPC client to query
|
||||
// the overall health status of each server. The grpc-go health
|
||||
// server, as constructed by health.NewServer(), unconditionally
|
||||
// sets the overall service (e.g. "") status to SERVING. If a
|
||||
// specific service name were set, the server would need to
|
||||
// explicitly transition that service to SERVING; otherwise,
|
||||
// clients would receive a NOT_FOUND status and the connection
|
||||
// would be marked as unhealthy (TRANSIENT_FAILURE).
|
||||
`{"healthCheckConfig": {"serviceName": ""},"loadBalancingConfig": [{"%s":{}}]}`,
|
||||
roundrobin.Name,
|
||||
),
|
||||
),
|
||||
grpc.WithTransportCredentials(creds),
|
||||
grpc.WithChainUnaryInterceptor(unaryInterceptors...),
|
||||
grpc.WithChainStreamInterceptor(streamInterceptors...),
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
|
@ -123,11 +124,20 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
|
|||
// This is the names which are allowlisted at the server level, plus the union
|
||||
// of all names which are allowlisted for any individual service.
|
||||
acceptedSANs := make(map[string]struct{})
|
||||
var acceptedSANsSlice []string
|
||||
for _, service := range sb.cfg.Services {
|
||||
for _, name := range service.ClientNames {
|
||||
acceptedSANs[name] = struct{}{}
|
||||
if !slices.Contains(acceptedSANsSlice, name) {
|
||||
acceptedSANsSlice = append(acceptedSANsSlice, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the health service has the same ClientNames as the other
|
||||
// services, so that health checks can be performed by clients which are
|
||||
// allowed to connect to the server.
|
||||
sb.cfg.Services[healthpb.Health_ServiceDesc.ServiceName].ClientNames = acceptedSANsSlice
|
||||
|
||||
creds, err := bcreds.NewServerCredentials(tlsConfig, acceptedSANs)
|
||||
if err != nil {
|
||||
|
|
@ -224,8 +234,12 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
|
|||
|
||||
// initLongRunningCheck initializes a goroutine which will periodically check
|
||||
// the health of the provided service and update the health server accordingly.
|
||||
//
|
||||
// TODO(#8255): Remove the service parameter and instead rely on transitioning
|
||||
// the overall health of the server (e.g. "") instead of individual services.
|
||||
func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, service string, checkImpl func(context.Context) error) {
|
||||
// Set the initial health status for the service.
|
||||
sb.healthSrv.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
|
||||
sb.healthSrv.SetServingStatus(service, healthpb.HealthCheckResponse_NOT_SERVING)
|
||||
|
||||
// check is a helper function that checks the health of the service and, if
|
||||
|
|
@ -249,10 +263,13 @@ func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, servi
|
|||
}
|
||||
|
||||
if next != healthpb.HealthCheckResponse_SERVING {
|
||||
sb.logger.Errf("transitioning overall health from %q to %q, due to: %s", last, next, err)
|
||||
sb.logger.Errf("transitioning health of %q from %q to %q, due to: %s", service, last, next, err)
|
||||
} else {
|
||||
sb.logger.Infof("transitioning overall health from %q to %q", last, next)
|
||||
sb.logger.Infof("transitioning health of %q from %q to %q", service, last, next)
|
||||
}
|
||||
sb.healthSrv.SetServingStatus("", next)
|
||||
sb.healthSrv.SetServingStatus(service, next)
|
||||
return next
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import (
|
|||
"google.golang.org/grpc/health"
|
||||
)
|
||||
|
||||
func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
|
||||
func TestServerBuilderInitLongRunningCheck(t *testing.T) {
|
||||
t.Parallel()
|
||||
hs := health.NewServer()
|
||||
mockLogger := blog.NewMock()
|
||||
|
|
@ -41,8 +41,8 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
|
|||
// - ~100ms 3rd check failed, SERVING to NOT_SERVING
|
||||
serving := mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
|
||||
notServing := mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
|
||||
test.Assert(t, len(serving) == 1, "expected one serving log line")
|
||||
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
|
||||
test.Assert(t, len(serving) == 2, "expected two serving log lines")
|
||||
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
|
||||
|
||||
mockLogger.Clear()
|
||||
|
||||
|
|
@ -67,6 +67,6 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
|
|||
// - ~100ms 3rd check passed, NOT_SERVING to SERVING
|
||||
serving = mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
|
||||
notServing = mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
|
||||
test.Assert(t, len(serving) == 2, "expected two serving log lines")
|
||||
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
|
||||
test.Assert(t, len(serving) == 4, "expected four serving log lines")
|
||||
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue