grpc: Enable client-side health_v1 health checking (#8254)

- Configure all gRPC clients to check the overall serving status of each
endpoint via the `grpc_health_v1` service.
- Configure all gRPC servers to expose the `grpc_health_v1` service to
any client permitted to access one of the server’s services.
- Modify long-running, deep health checks to set and transition the
overall (empty string) health status of the gRPC server in addition to
the specific service they were configured for.

Fixes #8227
This commit is contained in:
Samantha Frank 2025-06-18 10:37:20 -04:00 committed by GitHub
parent b6c5ee69ed
commit 1bfc3186c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 13 deletions

View File

@ -465,7 +465,7 @@ type GRPCServerConfig struct {
// These service names must match the service names advertised by gRPC itself,
// which are identical to the names set in our gRPC .proto files prefixed by
// the package names set in those files (e.g. "ca.CertificateAuthority").
Services map[string]GRPCServiceConfig `json:"services" validate:"required,dive,required"`
Services map[string]*GRPCServiceConfig `json:"services" validate:"required,dive,required"`
// MaxConnectionAge specifies how long a connection may live before the server sends a GoAway to the
// client. Because gRPC connections re-resolve DNS after a connection close,
// this controls how long it takes before a client learns about changes to its
@ -476,10 +476,10 @@ type GRPCServerConfig struct {
// GRPCServiceConfig contains the information needed to configure a gRPC service.
type GRPCServiceConfig struct {
// PerServiceClientNames is a map of gRPC service names to client certificate
// SANs. The upstream listening server will reject connections from clients
// which do not appear in this list, and the server interceptor will reject
// RPC calls for this service from clients which are not listed here.
// ClientNames is the list of accepted gRPC client certificate SANs.
// Connections from clients not in this list will be rejected by the
// upstream listener, and RPCs from unlisted clients will be denied by the
// server interceptor.
ClientNames []string `json:"clientNames" validate:"min=1,dive,hostname,required"`
}

View File

@ -14,11 +14,13 @@ import (
"github.com/letsencrypt/boulder/cmd"
bcreds "github.com/letsencrypt/boulder/grpc/creds"
// 'grpc/health' is imported for its init function, which causes clients to
// rely on the Health Service for load-balancing.
// 'grpc/internal/resolver/dns' is imported for its init function, which
// registers the SRV resolver.
"google.golang.org/grpc/balancer/roundrobin"
// 'grpc/health' is imported for its init function, which causes clients to
// rely on the Health Service for load-balancing as long as a
// "healthCheckConfig" is specified in the gRPC service config.
_ "google.golang.org/grpc/health"
_ "github.com/letsencrypt/boulder/grpc/internal/resolver/dns"
@ -61,7 +63,21 @@ func ClientSetup(c *cmd.GRPCClientConfig, tlsConfig *tls.Config, statsRegistry p
creds := bcreds.NewClientCredentials(tlsConfig.RootCAs, tlsConfig.Certificates, hostOverride)
return grpc.NewClient(
target,
grpc.WithDefaultServiceConfig(fmt.Sprintf(`{"loadBalancingConfig": [{"%s":{}}]}`, roundrobin.Name)),
grpc.WithDefaultServiceConfig(
fmt.Sprintf(
// By setting the service name to an empty string in
// healthCheckConfig, we're instructing the gRPC client to query
// the overall health status of each server. The grpc-go health
// server, as constructed by health.NewServer(), unconditionally
// sets the overall service (i.e. "") status to SERVING. If a
// specific service name were set, the server would need to
// explicitly transition that service to SERVING; otherwise,
// clients would receive a NOT_FOUND status and the connection
// would be marked as unhealthy (TRANSIENT_FAILURE).
`{"healthCheckConfig": {"serviceName": ""},"loadBalancingConfig": [{"%s":{}}]}`,
roundrobin.Name,
),
),
grpc.WithTransportCredentials(creds),
grpc.WithChainUnaryInterceptor(unaryInterceptors...),
grpc.WithChainStreamInterceptor(streamInterceptors...),

View File

@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"net"
"slices"
"strings"
"time"
@ -123,12 +124,21 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
// These are the names which are allowlisted at the server level, plus the union
// of all names which are allowlisted for any individual service.
acceptedSANs := make(map[string]struct{})
var acceptedSANsSlice []string
for _, service := range sb.cfg.Services {
for _, name := range service.ClientNames {
acceptedSANs[name] = struct{}{}
if !slices.Contains(acceptedSANsSlice, name) {
acceptedSANsSlice = append(acceptedSANsSlice, name)
}
}
}
// Ensure that the health service has the same ClientNames as the other
// services, so that health checks can be performed by clients which are
// allowed to connect to the server.
sb.cfg.Services[healthpb.Health_ServiceDesc.ServiceName].ClientNames = acceptedSANsSlice
creds, err := bcreds.NewServerCredentials(tlsConfig, acceptedSANs)
if err != nil {
return nil, err
@ -224,8 +234,12 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
// initLongRunningCheck initializes a goroutine which will periodically check
// the health of the provided service and update the health server accordingly.
//
// TODO(#8255): Remove the service parameter and rely on transitioning the
// overall health of the server (i.e. "") instead of individual services.
func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, service string, checkImpl func(context.Context) error) {
// Set the initial health status for the service.
sb.healthSrv.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
sb.healthSrv.SetServingStatus(service, healthpb.HealthCheckResponse_NOT_SERVING)
// check is a helper function that checks the health of the service and, if
@ -249,10 +263,13 @@ func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, servi
}
if next != healthpb.HealthCheckResponse_SERVING {
sb.logger.Errf("transitioning overall health from %q to %q, due to: %s", last, next, err)
sb.logger.Errf("transitioning health of %q from %q to %q, due to: %s", service, last, next, err)
} else {
sb.logger.Infof("transitioning overall health from %q to %q", last, next)
sb.logger.Infof("transitioning health of %q from %q to %q", service, last, next)
}
sb.healthSrv.SetServingStatus("", next)
sb.healthSrv.SetServingStatus(service, next)
return next
}

View File

@ -11,7 +11,7 @@ import (
"google.golang.org/grpc/health"
)
func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
func TestServerBuilderInitLongRunningCheck(t *testing.T) {
t.Parallel()
hs := health.NewServer()
mockLogger := blog.NewMock()
@ -41,8 +41,8 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
// - ~100ms 3rd check failed, SERVING to NOT_SERVING
serving := mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
notServing := mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
test.Assert(t, len(serving) == 1, "expected one serving log line")
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
test.Assert(t, len(serving) == 2, "expected two serving log lines")
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
mockLogger.Clear()
@ -67,6 +67,6 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
// - ~100ms 3rd check passed, NOT_SERVING to SERVING
serving = mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
notServing = mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
test.Assert(t, len(serving) == 2, "expected two serving log lines")
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
test.Assert(t, len(serving) == 4, "expected four serving log lines")
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
}