mirror of https://github.com/linkerd/linkerd2.git
375 lines
12 KiB
Go
375 lines
12 KiB
Go
package healthcheck
|
|
|
|
import (
|
|
"context"
|
|
"crypto/x509"
|
|
"fmt"
|
|
"strings"
|
|
|
|
healthcheckPb "github.com/linkerd/linkerd2/controller/gen/common/healthcheck"
|
|
"github.com/linkerd/linkerd2/pkg/healthcheck"
|
|
"github.com/linkerd/linkerd2/pkg/k8s"
|
|
"github.com/linkerd/linkerd2/pkg/tls"
|
|
"github.com/linkerd/linkerd2/viz/metrics-api/client"
|
|
pb "github.com/linkerd/linkerd2/viz/metrics-api/gen/viz"
|
|
vizLabels "github.com/linkerd/linkerd2/viz/pkg/labels"
|
|
corev1 "k8s.io/api/core/v1"
|
|
kerrors "k8s.io/apimachinery/pkg/api/errors"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
apiregistrationv1client "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1"
|
|
)
|
|
|
|
const (
|
|
// LinkerdVizExtensionCheck adds checks related to the Linkerd Viz extension
|
|
LinkerdVizExtensionCheck healthcheck.CategoryID = "linkerd-viz"
|
|
|
|
// LinkerdVizExtensionDataPlaneCheck adds checks related to dataplane for the linkerd-viz extension
|
|
LinkerdVizExtensionDataPlaneCheck healthcheck.CategoryID = "linkerd-viz-data-plane"
|
|
|
|
tapTLSSecretName = "linkerd-tap-k8s-tls"
|
|
tapOldTLSSecretName = "linkerd-tap-tls"
|
|
|
|
// linkerdTapAPIServiceName is the name of the tap api service
|
|
// This key is passed to checkApiService method to check whether
|
|
// the api service is available or not
|
|
linkerdTapAPIServiceName = "v1alpha1.tap.linkerd.io"
|
|
)
|
|
|
|
// HealthChecker wraps Linkerd's main healthchecker, adding extra fields for Viz
|
|
type HealthChecker struct {
|
|
*healthcheck.HealthChecker
|
|
vizNamespace string
|
|
vizAPIClient pb.ApiClient
|
|
}
|
|
|
|
// NewHealthChecker returns an initialized HealthChecker for Viz
|
|
// The parentCheckIDs are the category IDs of the linkerd core checks that
|
|
// are to be ran together with this instance
|
|
// The returned instance does not contain any of the viz Categories and
|
|
// to be explicitly added by using hc.AppendCategories
|
|
func NewHealthChecker(parentCheckIDs []healthcheck.CategoryID, options *healthcheck.Options) *HealthChecker {
|
|
parentHC := healthcheck.NewHealthChecker(parentCheckIDs, options)
|
|
return &HealthChecker{HealthChecker: parentHC}
|
|
}
|
|
|
|
// VizAPIClient returns a fully configured Viz API client
|
|
func (hc *HealthChecker) VizAPIClient() pb.ApiClient {
|
|
return hc.vizAPIClient
|
|
}
|
|
|
|
// RunChecks implements the healthcheck.Runner interface
|
|
func (hc *HealthChecker) RunChecks(observer healthcheck.CheckObserver) bool {
|
|
return hc.HealthChecker.RunChecks(observer)
|
|
}
|
|
|
|
// VizCategory returns a healthcheck.Category containing checkers
|
|
// to verify the health of viz components
|
|
func (hc *HealthChecker) VizCategory() healthcheck.Category {
|
|
|
|
return *healthcheck.NewCategory(LinkerdVizExtensionCheck, []healthcheck.Checker{
|
|
*healthcheck.NewChecker("linkerd-viz Namespace exists").
|
|
WithHintAnchor("l5d-viz-ns-exists").
|
|
Fatal().
|
|
WithCheck(func(ctx context.Context) error {
|
|
vizNs, err := hc.KubeAPIClient().GetNamespaceWithExtensionLabel(ctx, "linkerd-viz")
|
|
if err == nil {
|
|
hc.vizNamespace = vizNs.Name
|
|
}
|
|
return err
|
|
}),
|
|
*healthcheck.NewChecker("linkerd-viz ClusterRoles exist").
|
|
WithHintAnchor("l5d-viz-cr-exists").
|
|
Fatal().
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
return healthcheck.CheckClusterRoles(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace), fmt.Sprintf("linkerd-%s-metrics-api", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap-admin", hc.vizNamespace), "linkerd-tap-injector"}, "")
|
|
}),
|
|
*healthcheck.NewChecker("linkerd-viz ClusterRoleBindings exist").
|
|
WithHintAnchor("l5d-viz-crb-exists").
|
|
Fatal().
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
return healthcheck.CheckClusterRoleBindings(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace), fmt.Sprintf("linkerd-%s-metrics-api", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap-auth-delegator", hc.vizNamespace), "linkerd-tap-injector"}, "")
|
|
}),
|
|
*healthcheck.NewChecker("tap API server has valid cert").
|
|
WithHintAnchor("l5d-tap-cert-valid").
|
|
Fatal().
|
|
WithCheck(func(ctx context.Context) error {
|
|
anchors, err := fetchTapCaBundle(ctx, hc.KubeAPIClient())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cert, err := hc.FetchCredsFromSecret(ctx, hc.vizNamespace, tapTLSSecretName)
|
|
if kerrors.IsNotFound(err) {
|
|
cert, err = hc.FetchCredsFromOldSecret(ctx, hc.vizNamespace, tapOldTLSSecretName)
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
identityName := fmt.Sprintf("linkerd-tap.%s.svc", hc.vizNamespace)
|
|
return hc.CheckCertAndAnchors(cert, anchors, identityName)
|
|
}),
|
|
*healthcheck.NewChecker("tap API server cert is valid for at least 60 days").
|
|
WithHintAnchor("l5d-webhook-cert-not-expiring-soon").
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
cert, err := hc.FetchCredsFromSecret(ctx, hc.vizNamespace, tapTLSSecretName)
|
|
if kerrors.IsNotFound(err) {
|
|
cert, err = hc.FetchCredsFromOldSecret(ctx, hc.vizNamespace, tapOldTLSSecretName)
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return hc.CheckCertAndAnchorsExpiringSoon(cert)
|
|
}),
|
|
*healthcheck.NewChecker("tap API service is running").
|
|
WithHintAnchor("l5d-tap-api").
|
|
Warning().
|
|
WithRetryDeadline(hc.RetryDeadline).
|
|
WithCheck(func(ctx context.Context) error {
|
|
return hc.CheckAPIService(ctx, linkerdTapAPIServiceName)
|
|
}),
|
|
*healthcheck.NewChecker("linkerd-viz pods are injected").
|
|
WithHintAnchor("l5d-viz-pods-injection").
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return healthcheck.CheckIfDataPlanePodsExist(pods)
|
|
}),
|
|
*healthcheck.NewChecker("viz extension pods are running").
|
|
WithHintAnchor("l5d-viz-pods-running").
|
|
Warning().
|
|
WithRetryDeadline(hc.RetryDeadline).
|
|
SurfaceErrorOnRetry().
|
|
WithCheck(func(ctx context.Context) error {
|
|
pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check for relevant pods to be present
|
|
err = healthcheck.CheckForPods(pods, []string{"linkerd-web", "linkerd-tap", "linkerd-metrics-api", "tap-injector"})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return healthcheck.CheckPodsRunning(pods, "")
|
|
}),
|
|
*healthcheck.NewChecker("prometheus is installed and configured correctly").
|
|
WithHintAnchor("l5d-viz-prometheus").
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
// TODO: Skip if prometheus is disabled
|
|
// Check for ClusterRoles
|
|
err := healthcheck.CheckClusterRoles(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace)}, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check for ClusterRoleBindings
|
|
err = healthcheck.CheckClusterRoleBindings(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace)}, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check for ConfigMap
|
|
err = healthcheck.CheckConfigMaps(ctx, hc.KubeAPIClient(), hc.vizNamespace, true, []string{"linkerd-prometheus-config"}, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check for relevant pods to be present
|
|
pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = healthcheck.CheckForPods(pods, []string{"linkerd-prometheus"})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}),
|
|
*healthcheck.NewChecker("grafana is installed and configured correctly").
|
|
WithHintAnchor("l5d-viz-grafana").
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
// TODO: Skip if grafana is disabled
|
|
// Check for ConfigMap
|
|
err := healthcheck.CheckConfigMaps(ctx, hc.KubeAPIClient(), hc.vizNamespace, true, []string{"linkerd-grafana-config"}, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Check for relevant pods to be present
|
|
pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = healthcheck.CheckForPods(pods, []string{"linkerd-grafana"})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}),
|
|
*healthcheck.NewChecker("can initialize the client").
|
|
WithHintAnchor("l5d-viz-existence-client").
|
|
Fatal().
|
|
WithCheck(func(ctx context.Context) (err error) {
|
|
hc.vizAPIClient, err = client.NewExternalClient(ctx, hc.vizNamespace, hc.KubeAPIClient())
|
|
return
|
|
}),
|
|
*healthcheck.NewChecker("viz extension self-check").
|
|
WithHintAnchor("l5d-api-control-api").
|
|
Fatal().
|
|
// to avoid confusing users with a prometheus readiness error, we only show
|
|
// "waiting for check to complete" while things converge. If after the timeout
|
|
// it still hasn't converged, we show the real error (a 503 usually).
|
|
WithRetryDeadline(hc.RetryDeadline).
|
|
WithCheckRPC(func(ctx context.Context) (*healthcheckPb.SelfCheckResponse, error) {
|
|
return hc.vizAPIClient.SelfCheck(ctx, &healthcheckPb.SelfCheckRequest{})
|
|
}),
|
|
}, true)
|
|
}
|
|
|
|
// VizDataPlaneCategory returns a healthcheck.Category containing checkers
|
|
// to verify the data-plane metrics in prometheus and the tap injection
|
|
func (hc *HealthChecker) VizDataPlaneCategory() healthcheck.Category {
|
|
|
|
return *healthcheck.NewCategory(LinkerdVizExtensionDataPlaneCheck, []healthcheck.Checker{
|
|
*healthcheck.NewChecker("data plane namespace exists").
|
|
WithHintAnchor("l5d-data-plane-exists").
|
|
Fatal().
|
|
WithCheck(func(ctx context.Context) error {
|
|
if hc.DataPlaneNamespace == "" {
|
|
// when checking proxies in all namespaces, this check is a no-op
|
|
return nil
|
|
}
|
|
return hc.CheckNamespace(ctx, hc.DataPlaneNamespace, true)
|
|
}),
|
|
*healthcheck.NewChecker("data plane proxy metrics are present in Prometheus").
|
|
WithHintAnchor("l5d-data-plane-prom").
|
|
Warning().
|
|
WithRetryDeadline(hc.RetryDeadline).
|
|
WithCheck(func(ctx context.Context) (err error) {
|
|
pods, err := hc.getDataPlanePodsFromVizAPI(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// TODO: Check if prometheus is present
|
|
|
|
return validateDataPlanePodReporting(pods)
|
|
}),
|
|
*healthcheck.NewChecker("data-plane pods have tap enabled").
|
|
WithHintAnchor("l5d-viz-data-plane-tap").
|
|
Warning().
|
|
WithCheck(func(ctx context.Context) error {
|
|
pods, err := hc.GetDataPlanePods(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return hc.checkForTapConfiguration(ctx, pods)
|
|
}),
|
|
}, true)
|
|
}
|
|
|
|
func (hc *HealthChecker) getDataPlanePodsFromVizAPI(ctx context.Context) ([]*pb.Pod, error) {
|
|
|
|
req := &pb.ListPodsRequest{}
|
|
if hc.DataPlaneNamespace != "" {
|
|
req.Selector = &pb.ResourceSelection{
|
|
Resource: &pb.Resource{
|
|
Namespace: hc.DataPlaneNamespace,
|
|
},
|
|
}
|
|
}
|
|
|
|
resp, err := hc.VizAPIClient().ListPods(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
pods := make([]*pb.Pod, 0)
|
|
for _, pod := range resp.GetPods() {
|
|
if pod.ControllerNamespace == hc.ControlPlaneNamespace {
|
|
pods = append(pods, pod)
|
|
}
|
|
}
|
|
|
|
return pods, nil
|
|
}
|
|
|
|
// checkForTapConfiguration checks if the tap annotation is present
|
|
// only for the pods with tap enabled
|
|
func (hc *HealthChecker) checkForTapConfiguration(ctx context.Context, pods []corev1.Pod) error {
|
|
var podsWithoutTap []string
|
|
for i := range pods {
|
|
pod := pods[i]
|
|
ns, err := hc.KubeAPIClient().CoreV1().Namespaces().Get(ctx, pod.Namespace, metav1.GetOptions{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Check if Tap is disabled
|
|
if !k8s.IsTapDisabled(pod) && !k8s.IsTapDisabled(ns) {
|
|
// Check for tap-injector annotation
|
|
if !vizLabels.IsTapEnabled(&pod) {
|
|
podsWithoutTap = append(podsWithoutTap, fmt.Sprintf("* %s", pod.Name))
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(podsWithoutTap) > 0 {
|
|
return fmt.Errorf("Some data plane pods do not have tap configured and cannot be tapped:\n\t%s", strings.Join(podsWithoutTap, "\n\t"))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateDataPlanePodReporting(pods []*pb.Pod) error {
|
|
notInPrometheus := []string{}
|
|
|
|
for _, p := range pods {
|
|
// the `Added` field indicates the pod was found in Prometheus
|
|
if !p.Added {
|
|
notInPrometheus = append(notInPrometheus, p.Name)
|
|
}
|
|
}
|
|
|
|
errMsg := ""
|
|
if len(notInPrometheus) > 0 {
|
|
errMsg = fmt.Sprintf("Data plane metrics not found for %s.", strings.Join(notInPrometheus, ", "))
|
|
}
|
|
|
|
if errMsg != "" {
|
|
return fmt.Errorf(errMsg)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func fetchTapCaBundle(ctx context.Context, kubeAPI *k8s.KubernetesAPI) ([]*x509.Certificate, error) {
|
|
apiServiceClient, err := apiregistrationv1client.NewForConfig(kubeAPI.Config)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
apiService, err := apiServiceClient.APIServices().Get(ctx, linkerdTapAPIServiceName, metav1.GetOptions{})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
caBundle, err := tls.DecodePEMCertificates(string(apiService.Spec.CABundle))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return caBundle, nil
|
|
}
|