From 704ed00a4922a2edc0ccea969f3ad68e67911875 Mon Sep 17 00:00:00 2001 From: Tarun Pothulapati Date: Fri, 5 Feb 2021 11:26:44 +0530 Subject: [PATCH] viz: make checks aware of prom and grafana being optional (#5627) * viz: make checks aware of prom and grafana being optional Fixes #5618 Currently, the linkerd-viz checks fail whenever an external Prometheus is used, as those checks are not aware of Prometheus and Grafana being optional. This commit fixes this by splitting Prometheus and Grafana into separate checks which are not fatal; these checks can also be made dynamic and run only if those components are available. This commit also adds some of the missing resource checks, especially those for the new `metrics-api`, to the viz checks. Signed-off-by: Tarun Pothulapati --- pkg/healthcheck/healthcheck.go | 3 +- test/integration/testdata/check.viz.golden | 3 +- .../linkerd-viz/templates/metrics-api.yaml | 4 +- viz/pkg/healthcheck/healthcheck.go | 73 ++++++++++++++++--- 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/pkg/healthcheck/healthcheck.go b/pkg/healthcheck/healthcheck.go index 4dcf31a42..065c27207 100644 --- a/pkg/healthcheck/healthcheck.go +++ b/pkg/healthcheck/healthcheck.go @@ -2538,7 +2538,8 @@ func CheckForPods(pods []corev1.Pod, deployNames []string) error { for _, pod := range pods { // Strip randomized suffix and take the deployment name - deployName := strings.Join(strings.Split(pod.Name, "-")[:2], "-") + parts := strings.Split(pod.Name, "-") + deployName := strings.Join(parts[:len(parts)-2], "-") exists[deployName] = true } diff --git a/test/integration/testdata/check.viz.golden b/test/integration/testdata/check.viz.golden index 4947b1663..75ef2fa88 100644 --- a/test/integration/testdata/check.viz.golden +++ b/test/integration/testdata/check.viz.golden @@ -18,12 +18,13 @@ linkerd-viz √ linkerd-viz Namespace exists √ linkerd-viz ClusterRoles exist √ linkerd-viz ClusterRoleBindings exist -√ linkerd-viz ConfigMaps exist √ tap API 
server has valid cert √ tap API server cert is valid for at least 60 days √ tap API service is running √ linkerd-viz pods are injected √ viz extension pods are running +√ prometheus is installed and configured correctly +√ grafana is installed and configured correctly √ can initialize the client √ viz extension self-check √ [kubernetes] linkerd viz can talk to Kubernetes diff --git a/viz/charts/linkerd-viz/templates/metrics-api.yaml b/viz/charts/linkerd-viz/templates/metrics-api.yaml index 172cd5e91..5d70980d5 100644 --- a/viz/charts/linkerd-viz/templates/metrics-api.yaml +++ b/viz/charts/linkerd-viz/templates/metrics-api.yaml @@ -72,8 +72,10 @@ spec: - -cluster-domain={{.Values.clusterDomain}} {{- if .Values.prometheusUrl }} - -prometheus-url={{.Values.prometheusUrl}} + {{- else if .Values.prometheus.enabled }} + - -prometheus-url=http://linkerd-prometheus.{{.Values.namespace}}.svc.{{.Values.clusterDomain}}:9090 {{- else }} - - -prometheus-url=http://linkerd-prometheus.linkerd-viz.svc.{{.Values.clusterDomain}}:9090 + {{ fail "Please enable `linkerd-prometheus` or provide `prometheusUrl` for the viz extension to function properly"}} {{- end }} image: {{.Values.metricsAPI.image.registry}}/{{.Values.metricsAPI.image.name}}:{{.Values.metricsAPI.image.tag}} imagePullPolicy: {{.Values.metricsAPI.pullPolicy}} diff --git a/viz/pkg/healthcheck/healthcheck.go b/viz/pkg/healthcheck/healthcheck.go index 4ad2f79b9..1ad001a56 100644 --- a/viz/pkg/healthcheck/healthcheck.go +++ b/viz/pkg/healthcheck/healthcheck.go @@ -82,21 +82,14 @@ func (hc *HealthChecker) VizCategory() healthcheck.Category { Fatal(). Warning(). 
WithCheck(func(ctx context.Context) error { - return healthcheck.CheckClusterRoles(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace)}, "") + return healthcheck.CheckClusterRoles(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace), fmt.Sprintf("linkerd-%s-metrics-api", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap-admin", hc.vizNamespace), "linkerd-tap-injector"}, "") }), *healthcheck.NewChecker("linkerd-viz ClusterRoleBindings exist"). WithHintAnchor("l5d-viz-crb-exists"). Fatal(). Warning(). WithCheck(func(ctx context.Context) error { - return healthcheck.CheckClusterRoleBindings(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace)}, "") - }), - *healthcheck.NewChecker("linkerd-viz ConfigMaps exist"). - WithHintAnchor("l5d-viz-cm-exists"). - Fatal(). - Warning(). - WithCheck(func(ctx context.Context) error { - return healthcheck.CheckConfigMaps(ctx, hc.KubeAPIClient(), hc.vizNamespace, true, []string{"linkerd-prometheus-config", "linkerd-grafana-config"}, "") + return healthcheck.CheckClusterRoleBindings(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-tap", hc.vizNamespace), fmt.Sprintf("linkerd-%s-metrics-api", hc.vizNamespace), fmt.Sprintf("linkerd-%s-tap-auth-delegator", hc.vizNamespace), "linkerd-tap-injector"}, "") }), *healthcheck.NewChecker("tap API server has valid cert"). WithHintAnchor("l5d-tap-cert-valid"). 
@@ -159,13 +152,73 @@ func (hc *HealthChecker) VizCategory() healthcheck.Category { } // Check for relevant pods to be present - err = healthcheck.CheckForPods(pods, []string{"linkerd-grafana", "linkerd-prometheus", "linkerd-web", "linkerd-tap"}) + err = healthcheck.CheckForPods(pods, []string{"linkerd-web", "linkerd-tap", "linkerd-metrics-api", "tap-injector"}) if err != nil { return err } return healthcheck.CheckPodsRunning(pods, "") }), + *healthcheck.NewChecker("prometheus is installed and configured correctly"). + WithHintAnchor("l5d-viz-prometheus"). + Warning(). + WithCheck(func(ctx context.Context) error { + // TODO: Skip if prometheus is disabled + // Check for ClusterRoles + err := healthcheck.CheckClusterRoles(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace)}, "") + if err != nil { + return err + } + + // Check for ClusterRoleBindings + err = healthcheck.CheckClusterRoleBindings(ctx, hc.KubeAPIClient(), true, []string{fmt.Sprintf("linkerd-%s-prometheus", hc.vizNamespace)}, "") + if err != nil { + return err + } + + // Check for ConfigMap + err = healthcheck.CheckConfigMaps(ctx, hc.KubeAPIClient(), hc.vizNamespace, true, []string{"linkerd-prometheus-config"}, "") + if err != nil { + return err + } + + // Check for relevant pods to be present + pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace) + if err != nil { + return err + } + + err = healthcheck.CheckForPods(pods, []string{"linkerd-prometheus"}) + if err != nil { + return err + } + + return nil + }), + *healthcheck.NewChecker("grafana is installed and configured correctly"). + WithHintAnchor("l5d-viz-grafana"). + Warning(). 
+ WithCheck(func(ctx context.Context) error { + // TODO: Skip if grafana is disabled + // Check for ConfigMap + err := healthcheck.CheckConfigMaps(ctx, hc.KubeAPIClient(), hc.vizNamespace, true, []string{"linkerd-grafana-config"}, "") + if err != nil { + return err + } + + // Check for relevant pods to be present + pods, err := hc.KubeAPIClient().GetPodsByNamespace(ctx, hc.vizNamespace) + if err != nil { + return err + } + + err = healthcheck.CheckForPods(pods, []string{"linkerd-grafana"}) + if err != nil { + return err + } + + return nil + }), *healthcheck.NewChecker("can initialize the client"). WithHintAnchor("l5d-viz-existence-client"). Fatal().