grpc: Enable client-side health_v1 health checking (#8254)

- Configure all gRPC clients to check the overall serving status of each endpoint via the `grpc_health_v1` service. - Configure all gRPC servers to expose the `grpc_health_v1` service to any client permitted to access one of the server’s services. - Modify long-running, deep health checks to set and transition the overall (empty string) health status of the gRPC server in addition to the specific service they were configured for. Fixes #8227
2025-06-18 10:37:20 -04:00 · 2025-06-18 10:37:20 -04:00 · 1bfc3186c8
parent b6c5ee69ed
commit 1bfc3186c8
4 changed files with 46 additions and 13 deletions
--- a/cmd/config.go
+++ b/cmd/config.go
@ -465,7 +465,7 @@ type GRPCServerConfig struct {
 	// These service names must match the service names advertised by gRPC itself,
 	// which are identical to the names set in our gRPC .proto files prefixed by
 	// the package names set in those files (e.g. "ca.CertificateAuthority").
-	Services map[string]GRPCServiceConfig `json:"services" validate:"required,dive,required"`
+	Services map[string]*GRPCServiceConfig `json:"services" validate:"required,dive,required"`
 	// MaxConnectionAge specifies how long a connection may live before the server sends a GoAway to the
 	// client. Because gRPC connections re-resolve DNS after a connection close,
 	// this controls how long it takes before a client learns about changes to its
@ -476,10 +476,10 @@ type GRPCServerConfig struct {

 // GRPCServiceConfig contains the information needed to configure a gRPC service.
 type GRPCServiceConfig struct {
-	// PerServiceClientNames is a map of gRPC service names to client certificate
-	// SANs. The upstream listening server will reject connections from clients
-	// which do not appear in this list, and the server interceptor will reject
-	// RPC calls for this service from clients which are not listed here.
+	// ClientNames is the list of accepted gRPC client certificate SANs.
+	// Connections from clients not in this list will be rejected by the
+	// upstream listener, and RPCs from unlisted clients will be denied by the
+	// server interceptor.
 	ClientNames []string `json:"clientNames" validate:"min=1,dive,hostname,required"`
 }

--- a/grpc/client.go
+++ b/grpc/client.go
@ -14,11 +14,13 @@ import (
 	"github.com/letsencrypt/boulder/cmd"
 	bcreds "github.com/letsencrypt/boulder/grpc/creds"

-	// 'grpc/health' is imported for its init function, which causes clients to
-	// rely on the Health Service for load-balancing.
 	// 'grpc/internal/resolver/dns' is imported for its init function, which
 	// registers the SRV resolver.
 	"google.golang.org/grpc/balancer/roundrobin"
+
+	// 'grpc/health' is imported for its init function, which causes clients to
+	// rely on the Health Service for load-balancing as long as a
+	// "healthCheckConfig" is specified in the gRPC service config.
 	_ "google.golang.org/grpc/health"

 	_ "github.com/letsencrypt/boulder/grpc/internal/resolver/dns"
@ -61,7 +63,21 @@ func ClientSetup(c *cmd.GRPCClientConfig, tlsConfig *tls.Config, statsRegistry p
 	creds := bcreds.NewClientCredentials(tlsConfig.RootCAs, tlsConfig.Certificates, hostOverride)
 	return grpc.NewClient(
 		target,
-		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{"loadBalancingConfig": [{"%s":{}}]}`, roundrobin.Name)),
+		grpc.WithDefaultServiceConfig(
+			fmt.Sprintf(
+				// By setting the service name to an empty string in
+				// healthCheckConfig, we're instructing the gRPC client to query
+				// the overall health status of each server. The grpc-go health
+				// server, as constructed by health.NewServer(), unconditionally
+				// sets the overall service (e.g. "") status to SERVING. If a
+				// specific service name were set, the server would need to
+				// explicitly transition that service to SERVING; otherwise,
+				// clients would receive a NOT_FOUND status and the connection
+				// would be marked as unhealthy (TRANSIENT_FAILURE).
+				`{"healthCheckConfig": {"serviceName": ""},"loadBalancingConfig": [{"%s":{}}]}`,
+				roundrobin.Name,
+			),
+		),
 		grpc.WithTransportCredentials(creds),
 		grpc.WithChainUnaryInterceptor(unaryInterceptors...),
 		grpc.WithChainStreamInterceptor(streamInterceptors...),
--- a/grpc/server.go
+++ b/grpc/server.go
@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"net"
+	"slices"
 	"strings"
 	"time"

@ -123,12 +124,21 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
 	// This is the names which are allowlisted at the server level, plus the union
 	// of all names which are allowlisted for any individual service.
 	acceptedSANs := make(map[string]struct{})
+	var acceptedSANsSlice []string
 	for _, service := range sb.cfg.Services {
 		for _, name := range service.ClientNames {
 			acceptedSANs[name] = struct{}{}
+			if !slices.Contains(acceptedSANsSlice, name) {
+				acceptedSANsSlice = append(acceptedSANsSlice, name)
+			}
 		}
 	}

+	// Ensure that the health service has the same ClientNames as the other
+	// services, so that health checks can be performed by clients which are
+	// allowed to connect to the server.
+	sb.cfg.Services[healthpb.Health_ServiceDesc.ServiceName].ClientNames = acceptedSANsSlice
+
 	creds, err := bcreds.NewServerCredentials(tlsConfig, acceptedSANs)
 	if err != nil {
 		return nil, err
@ -224,8 +234,12 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R

 // initLongRunningCheck initializes a goroutine which will periodically check
 // the health of the provided service and update the health server accordingly.
+//
+// TODO(#8255): Remove the service parameter and instead rely on transitioning
+// the overall health of the server (e.g. "") instead of individual services.
 func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, service string, checkImpl func(context.Context) error) {
 	// Set the initial health status for the service.
+	sb.healthSrv.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
 	sb.healthSrv.SetServingStatus(service, healthpb.HealthCheckResponse_NOT_SERVING)

 	// check is a helper function that checks the health of the service and, if
@ -249,10 +263,13 @@ func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, servi
 		}

 		if next != healthpb.HealthCheckResponse_SERVING {
+			sb.logger.Errf("transitioning overall health from %q to %q, due to: %s", last, next, err)
 			sb.logger.Errf("transitioning health of %q from %q to %q, due to: %s", service, last, next, err)
 		} else {
+			sb.logger.Infof("transitioning overall health from %q to %q", last, next)
 			sb.logger.Infof("transitioning health of %q from %q to %q", service, last, next)
 		}
+		sb.healthSrv.SetServingStatus("", next)
 		sb.healthSrv.SetServingStatus(service, next)
 		return next
 	}
--- a/grpc/server_test.go
+++ b/grpc/server_test.go
@ -11,7 +11,7 @@ import (
 	"google.golang.org/grpc/health"
 )

-func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
+func TestServerBuilderInitLongRunningCheck(t *testing.T) {
 	t.Parallel()
 	hs := health.NewServer()
 	mockLogger := blog.NewMock()
@ -41,8 +41,8 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
 	//   - ~100ms 3rd check failed, SERVING to NOT_SERVING
 	serving := mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
 	notServing := mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
-	test.Assert(t, len(serving) == 1, "expected one serving log line")
-	test.Assert(t, len(notServing) == 1, "expected one not serving log line")
+	test.Assert(t, len(serving) == 2, "expected two serving log lines")
+	test.Assert(t, len(notServing) == 2, "expected two not serving log lines")

 	mockLogger.Clear()

@ -67,6 +67,6 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
 	//   - ~100ms 3rd check passed, NOT_SERVING to SERVING
 	serving = mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
 	notServing = mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
-	test.Assert(t, len(serving) == 2, "expected two serving log lines")
-	test.Assert(t, len(notServing) == 1, "expected one not serving log line")
+	test.Assert(t, len(serving) == 4, "expected four serving log lines")
+	test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
 }