enhancement: add `livez` endpoint

Add a `livez` endpoint to identify network outages. This helps in restarting the binary if such a case is observed. Signed-off-by: Pranshu Srivastava <rexagod@gmail.com> Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
2024-06-13 02:45:58 +05:30 · 2024-06-13 02:45:58 +05:30 · eb80c09755
parent 086af0cbaf
commit eb80c09755
9 changed files with 51 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo

 After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.

+#### Healthcheck Endpoints
+
+The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
+
+* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
+* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available on both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
+* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
+
 #### Limited privileges environment

 If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:
--- a/README.md.tpl
+++ b/README.md.tpl
@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo

 After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.

+#### Healthcheck Endpoints
+
+The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
+
+* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
+* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available on both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
+* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
+
 #### Limited privileges environment

 If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:
--- a/examples/autosharding/statefulset.yaml
+++ b/examples/autosharding/statefulset.yaml
@ -37,7 +37,7 @@ spec:
        image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
        livenessProbe:
          httpGet:
-            path: /healthz
+            path: /livez
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
--- a/examples/daemonsetsharding/daemonset.yaml
+++ b/examples/daemonsetsharding/daemonset.yaml
@ -32,7 +32,7 @@ spec:
        image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
        livenessProbe:
          httpGet:
-            path: /healthz
+            path: /livez
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
--- a/examples/daemonsetsharding/deployment-no-node-pods.yaml
+++ b/examples/daemonsetsharding/deployment-no-node-pods.yaml
@ -27,7 +27,7 @@ spec:
        image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
        livenessProbe:
          httpGet:
-            path: /healthz
+            path: /livez
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
--- a/examples/daemonsetsharding/deployment.yaml
+++ b/examples/daemonsetsharding/deployment.yaml
@ -26,7 +26,7 @@ spec:
        image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
        livenessProbe:
          httpGet:
-            path: /healthz
+            path: /livez
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
--- a/examples/standard/deployment.yaml
+++ b/examples/standard/deployment.yaml
@ -24,7 +24,7 @@ spec:
      - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
        livenessProbe:
          httpGet:
-            path: /healthz
+            path: /livez
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
--- a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet
+++ b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet
@ -193,7 +193,7 @@
      },
      livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
        port: 8080,
-        path: '/healthz',
+        path: '/livez',
      } },
      readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
        port: 8081,
--- a/pkg/app/server.go
+++ b/pkg/app/server.go
@ -30,6 +30,12 @@ import (
 	"strings"
 	"time"

+	"gopkg.in/yaml.v3"
+	"k8s.io/client-go/kubernetes"
+	_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
+	"k8s.io/client-go/tools/clientcmd"
+	"k8s.io/klog/v2"
+
 	"github.com/oklog/run"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/collectors"
@ -38,10 +44,6 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"github.com/prometheus/common/version"
 	"github.com/prometheus/exporter-toolkit/web"
-	"gopkg.in/yaml.v3"
-	_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
-	"k8s.io/client-go/tools/clientcmd"
-	"k8s.io/klog/v2"

 	"k8s.io/kube-state-metrics/v2/internal/discovery"
 	"k8s.io/kube-state-metrics/v2/internal/store"
@ -59,6 +61,7 @@ import (
 const (
 	metricsPath = "/metrics"
 	healthzPath = "/healthz"
+	livezPath   = "/livez"
 )

 // promLogger implements promhttp.Logger
@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
 		WebConfigFile:      &tlsConfig,
 	}

-	metricsMux := buildMetricsServer(m, durationVec)
+	metricsMux := buildMetricsServer(m, durationVec, kubeClient)
 	metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
 	metricsServer := http.Server{
 		Handler:           metricsMux,
@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
 	return mux
 }

-func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux {
+func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
 	mux := http.NewServeMux()

 	// TODO: This doesn't belong into serveMetrics
@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
 	mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
 	mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))

+	// Add metricsPath
 	mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
+
+	// Add livezPath
+	mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+
+		// Query the Kube API to make sure we are not affected by a network outage.
+		got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
+		if got.Error() != nil {
+			w.WriteHeader(http.StatusServiceUnavailable)
+			w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte(http.StatusText(http.StatusOK)))
+	}))
+
 	// Add healthzPath
 	mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
 				Address: healthzPath,
 				Text:    "Healthz",
 			},
+			{
+				Address: livezPath,
+				Text:    "Livez",
+			},
 		},
 	}
 	landingPage, err := web.NewLandingPage(landingConfig)