Merge pull request #330 from andyxning/add_metrics_for_kube-state-metrics

add kube-state-metrics own metrics
Frederic Branczyk 2018-01-03 16:45:27 +01:00 committed by GitHub
commit 310ce6c4b1
21 changed files with 140 additions and 28 deletions

View File

@ -24,6 +24,7 @@ the raw metrics.
- [Resource group version compatibility](#resource-group-version-compatibility)
- [Container Image](#container-image)
- [Metrics Documentation](#metrics-documentation)
- [Kube-state-metrics self metrics](#kube-state-metrics-self-metrics)
- [Resource recommendation](#resource-recommendation)
- [kube-state-metrics vs. Heapster](#kube-state-metrics-vs-heapster)
- [Setup](#setup)
@ -100,6 +101,14 @@ additional metrics!
See the [`Documentation`](Documentation) directory for documentation of the exposed metrics.
### Kube-state-metrics self metrics
kube-state-metrics exposes its own metrics on the address configured via the `--telemetry-host` and `--telemetry-port` flags (default `0.0.0.0:81`).
| Metric name | Metric type | Description | Labels/tags |
| ----------- | ----------- | ----------- | ----------- |
| ksm_scrape_error_total | Counter | Total scrape errors encountered when scraping a resource | `resource`=<resource name> |
| ksm_resources_per_scrape | Summary | Number of resources returned per scrape | `resource`=<resource name> |
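
As a quick sanity check that the telemetry endpoint is serving these series, a small Go program along the following lines can scrape it and print only the self metrics. This is a sketch that assumes the defaults added here (`--telemetry-host=0.0.0.0`, `--telemetry-port=81`), the standard `/metrics` path, and a kube-state-metrics instance reachable on localhost:

```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Default telemetry address from this change: --telemetry-host=0.0.0.0, --telemetry-port=81.
	resp, err := http.Get("http://localhost:81/metrics")
	if err != nil {
		log.Fatalf("scraping telemetry endpoint failed: %v", err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		// Print only the kube-state-metrics self metrics.
		if line := scanner.Text(); strings.HasPrefix(line, "ksm_") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("reading telemetry response failed: %v", err)
	}
}
```

Note that `ksm_resources_per_scrape` series only appear once the main metrics endpoint has been scraped at least once, and `ksm_scrape_error_total` series only appear after a resource has actually failed to list.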
### Resource recommendation
Resource usage changes with the size of the cluster. As a general rule, you should allocate

View File

@ -18,8 +18,26 @@ package collectors
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
var (
resyncPeriod = 5 * time.Minute
ScrapeErrorTotalMetric = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ksm_scrape_error_total",
Help: "Total scrape errors encountered when scraping a resource",
},
[]string{"resource"},
)
ResourcesPerScrapeMetric = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "ksm_resources_per_scrape",
Help: "Number of resources returned per scrape",
},
[]string{"resource"},
)
)
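
Every collector touched below applies the same pattern in its Collect method: a failed List increments ScrapeErrorTotalMetric for that resource, a successful one feeds the item count into ResourcesPerScrapeMetric. The following is a minimal, self-contained sketch of that pattern; the fooStore type and the "foo" resource label are hypothetical, only the two exported metric vectors above come from this change:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	kcollectors "k8s.io/kube-state-metrics/collectors"
)

// fooStore stands in for the cache-backed stores the real collectors wrap.
type fooStore struct {
	items []string
	err   error
}

func (s fooStore) List() ([]string, error) { return s.items, s.err }

// collectFoo mirrors the pattern added to every collector's Collect method:
// a failed List increments the per-resource error counter, a successful one
// records how many objects were returned.
func collectFoo(store fooStore) {
	items, err := store.List()
	if err != nil {
		kcollectors.ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "foo"}).Inc()
		fmt.Printf("listing foos failed: %s\n", err)
		return
	}
	kcollectors.ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "foo"}).Observe(float64(len(items)))
}

func main() {
	collectFoo(fooStore{items: []string{"a", "b"}})         // observes 2 resources
	collectFoo(fooStore{err: errors.New("list timed out")}) // increments the error counter
}
```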

View File

@ -75,9 +75,12 @@ func (csc *componentStatusCollector) Describe(ch chan<- *prometheus.Desc) {
func (csc *componentStatusCollector) Collect(ch chan<- prometheus.Metric) {
csl, err := csc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "componentstatus"}).Inc()
glog.Errorf("listing component status failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "componentstatus"}).Observe(float64(len(csl.Items)))
for _, s := range csl.Items {
csc.collectComponentStatus(ch, s)
}

View File

@ -128,9 +128,12 @@ func (dc *cronJobCollector) Describe(ch chan<- *prometheus.Desc) {
func (cjc *cronJobCollector) Collect(ch chan<- prometheus.Metric) {
cronjobs, err := cjc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "cronjob"}).Inc()
glog.Errorf("listing cronjobs failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "cronjob"}).Observe(float64(len(cronjobs)))
for _, cj := range cronjobs {
cjc.collectCronJob(ch, cj)
}

View File

@ -123,9 +123,12 @@ func (dc *daemonsetCollector) Describe(ch chan<- *prometheus.Desc) {
func (dc *daemonsetCollector) Collect(ch chan<- prometheus.Metric) {
dss, err := dc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "daemonset"}).Inc()
glog.Errorf("listing daemonsets failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "daemonset"}).Observe(float64(len(dss)))
for _, d := range dss {
dc.collectDaemonSet(ch, d)
}

View File

@ -147,9 +147,12 @@ func (dc *deploymentCollector) Describe(ch chan<- *prometheus.Desc) {
func (dc *deploymentCollector) Collect(ch chan<- prometheus.Metric) {
ds, err := dc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "deployment"}).Inc()
glog.Errorf("listing deployments failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "deployment"}).Observe(float64(len(ds)))
for _, d := range ds {
dc.collectDeployment(ch, d)
}

View File

@ -107,9 +107,12 @@ func (pc *endpointCollector) Describe(ch chan<- *prometheus.Desc) {
func (ec *endpointCollector) Collect(ch chan<- prometheus.Metric) {
endpoints, err := ec.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "endpoint"}).Inc()
glog.Errorf("listing endpoints failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "endpoint"}).Observe(float64(len(endpoints)))
for _, e := range endpoints {
ec.collectEndpoints(ch, e)
}

View File

@ -111,9 +111,12 @@ func (hc *hpaCollector) Describe(ch chan<- *prometheus.Desc) {
func (hc *hpaCollector) Collect(ch chan<- prometheus.Metric) {
hpas, err := hc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "horizontalpodautoscaler"}).Inc()
glog.Errorf("listing HorizontalPodAutoscalers failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "horizontalpodautoscaler"}).Observe(float64(len(hpas.Items)))
for _, h := range hpas.Items {
hc.collectHPA(ch, h)
}

View File

@ -152,9 +152,12 @@ func (dc *jobCollector) Describe(ch chan<- *prometheus.Desc) {
func (jc *jobCollector) Collect(ch chan<- prometheus.Metric) {
jobs, err := jc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "job"}).Inc()
glog.Errorf("listing jobs failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "job"}).Observe(float64(len(jobs)))
for _, j := range jobs {
jc.collectJob(ch, j)
}

View File

@ -88,10 +88,12 @@ func (lrc *limitRangeCollector) Describe(ch chan<- *prometheus.Desc) {
func (lrc *limitRangeCollector) Collect(ch chan<- prometheus.Metric) {
limitRangeCollector, err := lrc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "limitrange"}).Inc()
glog.Errorf("listing limit ranges failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "limitrange"}).Observe(float64(len(limitRangeCollector.Items)))
for _, rq := range limitRangeCollector.Items {
lrc.collectLimitRange(ch, rq)
}

View File

@ -100,10 +100,12 @@ func (nsc *namespaceCollector) Describe(ch chan<- *prometheus.Desc) {
func (nsc *namespaceCollector) Collect(ch chan<- prometheus.Metric) {
nsls, err := nsc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "namespace"}).Inc()
glog.Errorf("listing namespace failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "namespace"}).Observe(float64(len(nsls)))
for _, rq := range nsls {
nsc.collectNamespace(ch, rq)
}

View File

@ -173,9 +173,12 @@ func (nc *nodeCollector) Describe(ch chan<- *prometheus.Desc) {
func (nc *nodeCollector) Collect(ch chan<- prometheus.Metric) {
nodes, err := nc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "node"}).Inc()
glog.Errorf("listing nodes failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "node"}).Observe(float64(len(nodes.Items)))
for _, n := range nodes.Items {
nc.collectNode(ch, n)
}

View File

@ -86,10 +86,12 @@ func (collector *persistentVolumeCollector) Describe(ch chan<- *prometheus.Desc)
func (collector *persistentVolumeCollector) Collect(ch chan<- prometheus.Metric) {
persistentVolumeCollector, err := collector.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "persistentvolume"}).Inc()
glog.Errorf("listing persistentVolume failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "persistentvolume"}).Observe(float64(len(persistentVolumeCollector.Items)))
for _, pv := range persistentVolumeCollector.Items {
collector.collectPersistentVolume(ch, pv)
}

View File

@ -99,10 +99,12 @@ func (collector *persistentVolumeClaimCollector) Describe(ch chan<- *prometheus.
func (collector *persistentVolumeClaimCollector) Collect(ch chan<- prometheus.Metric) {
persistentVolumeClaimCollector, err := collector.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "persistentvolumeclaim"}).Inc()
glog.Errorf("listing persistent volume claims failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "persistentvolumeclaim"}).Observe(float64(len(persistentVolumeClaimCollector.Items)))
for _, pvc := range persistentVolumeClaimCollector.Items {
collector.collectPersistentVolumeClaim(ch, pvc)
}

View File

@ -233,9 +233,12 @@ func (pc *podCollector) Describe(ch chan<- *prometheus.Desc) {
func (pc *podCollector) Collect(ch chan<- prometheus.Metric) {
pods, err := pc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "pod"}).Inc()
glog.Errorf("listing pods failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "pod"}).Observe(float64(len(pods)))
for _, p := range pods {
pc.collectPod(ch, p)
}

View File

@ -111,9 +111,12 @@ func (dc *replicasetCollector) Describe(ch chan<- *prometheus.Desc) {
func (dc *replicasetCollector) Collect(ch chan<- prometheus.Metric) {
rss, err := dc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "replicaset"}).Inc()
glog.Errorf("listing replicasets failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "replicaset"}).Observe(float64(len(rss)))
for _, d := range rss {
dc.collectReplicaSet(ch, d)
}

View File

@ -117,9 +117,12 @@ func (dc *replicationcontrollerCollector) Describe(ch chan<- *prometheus.Desc) {
func (dc *replicationcontrollerCollector) Collect(ch chan<- prometheus.Metric) {
rcs, err := dc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "replicationcontroller"}).Inc()
glog.Errorf("listing replicationcontrollers failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "replicationcontroller"}).Observe(float64(len(rcs)))
for _, d := range rcs {
dc.collectReplicationController(ch, d)
}

View File

@ -86,10 +86,12 @@ func (rqc *resourceQuotaCollector) Describe(ch chan<- *prometheus.Desc) {
func (rqc *resourceQuotaCollector) Collect(ch chan<- prometheus.Metric) {
resourceQuota, err := rqc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "resourcequota"}).Inc()
glog.Errorf("listing resource quotas failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "resourcequota"}).Observe(float64(len(resourceQuota.Items)))
for _, rq := range resourceQuota.Items {
rqc.collectResourceQuota(ch, rq)
}

View File

@ -100,9 +100,12 @@ func (pc *serviceCollector) Describe(ch chan<- *prometheus.Desc) {
func (sc *serviceCollector) Collect(ch chan<- prometheus.Metric) {
services, err := sc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "service"}).Inc()
glog.Errorf("listing services failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "service"}).Observe(float64(len(services)))
for _, s := range services {
sc.collectService(ch, s)
}

View File

@ -134,9 +134,12 @@ func (dc *statefulSetCollector) Describe(ch chan<- *prometheus.Desc) {
func (sc *statefulSetCollector) Collect(ch chan<- prometheus.Metric) {
sss, err := sc.store.List()
if err != nil {
ScrapeErrorTotalMetric.With(prometheus.Labels{"resource": "statefulset"}).Inc()
glog.Errorf("listing statefulsets failed: %s", err)
return
}
ResourcesPerScrapeMetric.With(prometheus.Labels{"resource": "statefulset"}).Observe(float64(len(sss)))
for _, d := range sss {
sc.collectStatefulSet(ch, d)
}

main.go
View File

@ -38,7 +38,7 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/kube-state-metrics/collectors"
kcollectors "k8s.io/kube-state-metrics/collectors"
"k8s.io/kube-state-metrics/version"
)
@ -69,24 +69,24 @@ var (
"endpoints": struct{}{},
}
availableCollectors = map[string]func(registry prometheus.Registerer, kubeClient clientset.Interface, namespace string){
"componentstatuses": collectors.RegisterComponentStatusCollector,
"cronjobs": collectors.RegisterCronJobCollector,
"daemonsets": collectors.RegisterDaemonSetCollector,
"deployments": collectors.RegisterDeploymentCollector,
"jobs": collectors.RegisterJobCollector,
"limitranges": collectors.RegisterLimitRangeCollector,
"nodes": collectors.RegisterNodeCollector,
"pods": collectors.RegisterPodCollector,
"replicasets": collectors.RegisterReplicaSetCollector,
"replicationcontrollers": collectors.RegisterReplicationControllerCollector,
"resourcequotas": collectors.RegisterResourceQuotaCollector,
"services": collectors.RegisterServiceCollector,
"statefulsets": collectors.RegisterStatefulSetCollector,
"persistentvolumes": collectors.RegisterPersistentVolumeCollector,
"persistentvolumeclaims": collectors.RegisterPersistentVolumeClaimCollector,
"namespaces": collectors.RegisterNamespaceCollector,
"horizontalpodautoscalers": collectors.RegisterHorizontalPodAutoScalerCollector,
"endpoints": collectors.RegisterEndpointCollector,
"componentstatuses": kcollectors.RegisterComponentStatusCollector,
"cronjobs": kcollectors.RegisterCronJobCollector,
"daemonsets": kcollectors.RegisterDaemonSetCollector,
"deployments": kcollectors.RegisterDeploymentCollector,
"jobs": kcollectors.RegisterJobCollector,
"limitranges": kcollectors.RegisterLimitRangeCollector,
"nodes": kcollectors.RegisterNodeCollector,
"pods": kcollectors.RegisterPodCollector,
"replicasets": kcollectors.RegisterReplicaSetCollector,
"replicationcontrollers": kcollectors.RegisterReplicationControllerCollector,
"resourcequotas": kcollectors.RegisterResourceQuotaCollector,
"services": kcollectors.RegisterServiceCollector,
"statefulsets": kcollectors.RegisterStatefulSetCollector,
"persistentvolumes": kcollectors.RegisterPersistentVolumeCollector,
"persistentvolumeclaims": kcollectors.RegisterPersistentVolumeClaimCollector,
"namespaces": kcollectors.RegisterNamespaceCollector,
"horizontalpodautoscalers": kcollectors.RegisterHorizontalPodAutoScalerCollector,
"endpoints": kcollectors.RegisterEndpointCollector,
}
)
@ -129,15 +129,17 @@ func (c *collectorSet) Type() string {
}
type options struct {
inCluster bool
apiserver string
kubeconfig string
help bool
port int
host string
collectors collectorSet
namespace string
version bool
inCluster bool
apiserver string
kubeconfig string
help bool
port int
host string
telemetryPort int
telemetryHost string
collectors collectorSet
namespace string
version bool
}
func main() {
@ -154,6 +156,8 @@ func main() {
flags.BoolVarP(&options.help, "help", "h", false, "Print help text")
flags.IntVar(&options.port, "port", 80, `Port to expose metrics on.`)
flags.StringVar(&options.host, "host", "0.0.0.0", `Host to expose metrics on.`)
flags.IntVar(&options.telemetryPort, "telemetry-port", 81, `Port to expose kube-state-metrics self metrics on.`)
flags.StringVar(&options.telemetryHost, "telemetry-host", "0.0.0.0", `Host to expose kube-state-metrics self metrics on.`)
flags.Var(&options.collectors, "collectors", fmt.Sprintf("Comma-separated list of collectors to be enabled. Defaults to %q", &defaultCollectors))
flags.StringVar(&options.namespace, "namespace", metav1.NamespaceAll, "namespace to be enabled for collecting resources")
flags.BoolVarP(&options.version, "version", "", false, "kube-state-metrics build version information")
@ -206,6 +210,13 @@ func main() {
glog.Fatalf("Failed to create client: %v", err)
}
ksmMetricsRegistry := prometheus.NewRegistry()
ksmMetricsRegistry.Register(kcollectors.ResourcesPerScrapeMetric)
ksmMetricsRegistry.Register(kcollectors.ScrapeErrorTotalMetric)
ksmMetricsRegistry.Register(prometheus.NewProcessCollector(os.Getpid(), ""))
ksmMetricsRegistry.Register(prometheus.NewGoCollector())
go telemetryServer(ksmMetricsRegistry, options.telemetryHost, options.telemetryPort)
registry := prometheus.NewRegistry()
registerCollectors(registry, kubeClient, collectors, options.namespace)
metricsServer(registry, options.host, options.port)
@ -273,6 +284,31 @@ func createKubeClient(inCluster bool, apiserver string, kubeconfig string) (kube
return kubeClient, nil
}
func telemetryServer(registry prometheus.Gatherer, host string, port int) {
// Address to listen on for web interface and telemetry
listenAddress := net.JoinHostPort(host, strconv.Itoa(port))
glog.Infof("Starting kube-state-metrics self metrics server: %s", listenAddress)
mux := http.NewServeMux()
// Add metricsPath
mux.Handle(metricsPath, promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
// Add index
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(`<html>
<head><title>Kube-State-Metrics Metrics Server</title></head>
<body>
<h1>Kube-State-Metrics Metrics</h1>
<ul>
<li><a href='` + metricsPath + `'>metrics</a></li>
</ul>
</body>
</html>`))
})
log.Fatal(http.ListenAndServe(listenAddress, mux))
}
func metricsServer(registry prometheus.Gatherer, host string, port int) {
// Address to listen on for web interface and telemetry
listenAddress := net.JoinHostPort(host, strconv.Itoa(port))