enhancement: add `livez` endpoint

Add a `livez` endpoint to identify network outages. This helps in
restarting the binary if such a case is observed.

Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>

Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
This commit is contained in:
Pranshu Srivastava 2024-06-13 02:45:58 +05:30
parent 086af0cbaf
commit eb80c09755
No known key found for this signature in database
GPG Key ID: 63938388A4528764
9 changed files with 51 additions and 12 deletions

View File

@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service. After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
#### Healthcheck Endpoints
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
#### Limited privileges environment #### Limited privileges environment
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can: If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

View File

@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service. After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
#### Healthcheck Endpoints
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend using the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.
#### Limited privileges environment #### Limited privileges environment
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can: If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

View File

@ -37,7 +37,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /healthz path: /livez
port: 8080 port: 8080
initialDelaySeconds: 5 initialDelaySeconds: 5
timeoutSeconds: 5 timeoutSeconds: 5

View File

@ -32,7 +32,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /healthz path: /livez
port: 8080 port: 8080
initialDelaySeconds: 5 initialDelaySeconds: 5
timeoutSeconds: 5 timeoutSeconds: 5

View File

@ -27,7 +27,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /healthz path: /livez
port: 8080 port: 8080
initialDelaySeconds: 5 initialDelaySeconds: 5
timeoutSeconds: 5 timeoutSeconds: 5

View File

@ -26,7 +26,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /healthz path: /livez
port: 8080 port: 8080
initialDelaySeconds: 5 initialDelaySeconds: 5
timeoutSeconds: 5 timeoutSeconds: 5

View File

@ -24,7 +24,7 @@ spec:
- image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /healthz path: /livez
port: 8080 port: 8080
initialDelaySeconds: 5 initialDelaySeconds: 5
timeoutSeconds: 5 timeoutSeconds: 5

View File

@ -193,7 +193,7 @@
}, },
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: { livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8080, port: 8080,
path: '/healthz', path: '/livez',
} }, } },
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: { readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8081, port: 8081,

View File

@ -30,6 +30,12 @@ import (
"strings" "strings"
"time" "time"
"gopkg.in/yaml.v3"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"github.com/oklog/run" "github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/collectors"
@ -38,10 +44,6 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/version" "github.com/prometheus/common/version"
"github.com/prometheus/exporter-toolkit/web" "github.com/prometheus/exporter-toolkit/web"
"gopkg.in/yaml.v3"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"k8s.io/kube-state-metrics/v2/internal/discovery" "k8s.io/kube-state-metrics/v2/internal/discovery"
"k8s.io/kube-state-metrics/v2/internal/store" "k8s.io/kube-state-metrics/v2/internal/store"
@ -59,6 +61,7 @@ import (
const ( const (
metricsPath = "/metrics" metricsPath = "/metrics"
healthzPath = "/healthz" healthzPath = "/healthz"
livezPath = "/livez"
) )
// promLogger implements promhttp.Logger // promLogger implements promhttp.Logger
@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
WebConfigFile: &tlsConfig, WebConfigFile: &tlsConfig,
} }
metricsMux := buildMetricsServer(m, durationVec) metricsMux := buildMetricsServer(m, durationVec, kubeClient)
metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port)) metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
metricsServer := http.Server{ metricsServer := http.Server{
Handler: metricsMux, Handler: metricsMux,
@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
return mux return mux
} }
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux { func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
mux := http.NewServeMux() mux := http.NewServeMux()
// TODO: This doesn't belong into serveMetrics // TODO: This doesn't belong into serveMetrics
@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol)) mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace)) mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))
// Add metricsPath
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m)) mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
// Add livezPath
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
// Query the Kube API to make sure we are not affected by a network outage.
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
if got.Error() != nil {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte(http.StatusText(http.StatusOK)))
}))
// Add healthzPath // Add healthzPath
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) { mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)
@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
Address: healthzPath, Address: healthzPath,
Text: "Healthz", Text: "Healthz",
}, },
{
Address: livezPath,
Text: "Livez",
},
}, },
} }
landingPage, err := web.NewLandingPage(landingConfig) landingPage, err := web.NewLandingPage(landingConfig)