enhancement: add `livez` endpoint

Add a `livez` endpoint to identify network outages. This helps in
restarting the binary if such as case is observed.

Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>

Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
This commit is contained in:
Pranshu Srivastava 2024-06-13 02:45:58 +05:30
parent 086af0cbaf
commit eb80c09755
No known key found for this signature in database
GPG Key ID: 63938388A4528764
9 changed files with 51 additions and 12 deletions

View File

@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
#### Healthcheck Endpoints
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
#### Limited privileges environment
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

View File

@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo
After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.
#### Healthcheck Endpoints
The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
#### Limited privileges environment
If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:

View File

@ -37,7 +37,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5

View File

@ -32,7 +32,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5

View File

@ -27,7 +27,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5

View File

@ -26,7 +26,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5

View File

@ -24,7 +24,7 @@ spec:
- image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5

View File

@ -193,7 +193,7 @@
},
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8080,
path: '/healthz',
path: '/livez',
} },
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8081,

View File

@ -30,6 +30,12 @@ import (
"strings"
"time"
"gopkg.in/yaml.v3"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
@ -38,10 +44,6 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/version"
"github.com/prometheus/exporter-toolkit/web"
"gopkg.in/yaml.v3"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
"k8s.io/kube-state-metrics/v2/internal/discovery"
"k8s.io/kube-state-metrics/v2/internal/store"
@ -59,6 +61,7 @@ import (
const (
metricsPath = "/metrics"
healthzPath = "/healthz"
livezPath = "/livez"
)
// promLogger implements promhttp.Logger
@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
WebConfigFile: &tlsConfig,
}
metricsMux := buildMetricsServer(m, durationVec)
metricsMux := buildMetricsServer(m, durationVec, kubeClient)
metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
metricsServer := http.Server{
Handler: metricsMux,
@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
return mux
}
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux {
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
mux := http.NewServeMux()
// TODO: This doesn't belong into serveMetrics
@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))
// Add metricsPath
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))
// Add livezPath
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
// Query the Kube API to make sure we are not affected by a network outage.
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
if got.Error() != nil {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte(http.StatusText(http.StatusOK)))
}))
// Add healthzPath
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
Address: healthzPath,
Text: "Healthz",
},
{
Address: livezPath,
Text: "Livez",
},
},
}
landingPage, err := web.NewLandingPage(landingConfig)