Add more node metrics.

This adds metrics on the current node phase, on allocatable and capacity
resources, and on the out-of-disk condition.
This commit is contained in:
Fabian Reinartz 2016-09-09 10:24:33 +02:00
parent a7da0baf45
commit 15d8fccf53
3 changed files with 197 additions and 21 deletions

View File

@ -28,6 +28,14 @@ additional metrics!
| ---------- | ----------- | ----------- |
| node_info | Gauge | `node`=&lt;node-address&gt; <br> `kernel_version`=&lt;kernel-version&gt; <br> `os_image`=&lt;os-image-name&gt; <br> `container_runtime_version`=&lt;container-runtime-and-version-combination&gt; <br> `kubelet_version`=&lt;kubelet-version&gt; <br> `kubeproxy_version`=&lt;kubeproxy-version&gt; |
| node_status_ready| Gauge | `node`=&lt;node-address&gt; <br> `condition`=&lt;true\|false\|unknown&gt; |
| node_status_out_of_disk | Gauge | `node`=&lt;node-address&gt; <br> `condition`=&lt;true\|false\|unknown&gt; |
| node_status_phase| Gauge | `node`=&lt;node-address&gt; <br> `phase`=&lt;Pending\|Running\|Terminated&gt; |
| node_status_capacity_cpu_cores | Gauge | `node`=&lt;node-address&gt;|
| node_status_capacity_memory_bytes | Gauge | `node`=&lt;node-address&gt;|
| node_status_capacity_pods | Gauge | `node`=&lt;node-address&gt;|
| node_status_allocateable_cpu_cores | Gauge | `node`=&lt;node-address&gt;|
| node_status_allocateable_memory_bytes | Gauge | `node`=&lt;node-address&gt;|
| node_status_allocateable_pods | Gauge | `node`=&lt;node-address&gt;|
| deployment_replicas | Gauge | `deployment`=&lt;deployment-name&gt; <br> `namespace`=&lt;deployment-namespace&gt; |
| deployment_replicas_available | Gauge | `deployment`=&lt;deployment-name&gt; <br> `namespace`=&lt;deployment-namespace&gt; |
| pod_container_restarts | Counter | `container`=&lt;container-name&gt; <br> `namespace`=&lt;pod-namespace&gt; <br> `pod`=&lt;pod-name&gt; |

101
node.go
View File

@ -41,6 +41,48 @@ var (
"The ready status of a cluster node.",
[]string{"node", "condition"}, nil,
)
descNodeStatusOutOfDisk = prometheus.NewDesc(
"node_status_out_of_disk",
"Whether the node is out of disk space",
[]string{"node", "condition"}, nil,
)
descNodeStatusPhase = prometheus.NewDesc(
"node_status_phase",
"The phase the node is currently in.",
[]string{"node", "phase"}, nil,
)
descNodeStatusCapacityPods = prometheus.NewDesc(
"node_status_capacity_pods",
"The total pod resources of the node.",
[]string{"node"}, nil,
)
descNodeStatusCapacityCPU = prometheus.NewDesc(
"node_status_capacity_cpu_cores",
"The total CPU resources of the node.",
[]string{"node"}, nil,
)
descNodeStatusCapacityMemory = prometheus.NewDesc(
"node_status_capacity_memory_bytes",
"The total memory resources of the node.",
[]string{"node"}, nil,
)
descNodeStatusAllocateablePods = prometheus.NewDesc(
"node_status_allocateable_pods",
"The pod resources of a node that are available for scheduling.",
[]string{"node"}, nil,
)
descNodeStatusAllocateableCPU = prometheus.NewDesc(
"node_status_allocateable_cpu_cores",
"The CPU resources of a node that are available for scheduling.",
[]string{"node"}, nil,
)
descNodeStatusAllocateableMemory = prometheus.NewDesc(
"node_status_allocateable_memory_bytes",
"The memory resources of a node that are available for scheduling.",
[]string{"node"}, nil,
)
)
type nodeStore interface {
@ -56,6 +98,14 @@ type nodeCollector struct {
// Describe implements the prometheus.Collector interface.
func (nc *nodeCollector) Describe(ch chan<- *prometheus.Desc) {
	// Advertise every descriptor this collector can emit.
	for _, d := range []*prometheus.Desc{
		descNodeInfo,
		descNodeStatusReady,
		descNodeStatusOutOfDisk,
		descNodeStatusPhase,
		descNodeStatusCapacityCPU,
		descNodeStatusCapacityMemory,
		descNodeStatusCapacityPods,
		descNodeStatusAllocateableCPU,
		descNodeStatusAllocateableMemory,
		descNodeStatusAllocateablePods,
	} {
		ch <- d
	}
}
// Collect implements the prometheus.Collector interface.
@ -71,30 +121,55 @@ func (nc *nodeCollector) Collect(ch chan<- prometheus.Metric) {
}
func (nc *nodeCollector) collectNode(ch chan<- prometheus.Metric, n api.Node) {
// Collect node conditions and while default to false.
// TODO(fabxc): add remaining conditions: NodeOutOfDisk, NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
for _, c := range n.Status.Conditions {
switch c.Type {
case api.NodeReady:
nodeStatusMetrics(ch, descNodeStatusReady, n.Name, c.Status)
}
addGauge := func(desc *prometheus.Desc, v float64, lv ...string) {
lv = append([]string{n.Name}, lv...)
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
}
// NOTE: the instrumentation API requires providing label values in order of declaration
// in the metric descriptor. Be careful when making modifications.
ch <- prometheus.MustNewConstMetric(
descNodeInfo, prometheus.GaugeValue, 1,
n.Name,
addGauge(descNodeInfo, 1,
n.Status.NodeInfo.KernelVersion,
n.Status.NodeInfo.OSImage,
n.Status.NodeInfo.ContainerRuntimeVersion,
n.Status.NodeInfo.KubeletVersion,
n.Status.NodeInfo.KubeProxyVersion,
)
// Collect node conditions and while default to false.
// TODO(fabxc): add remaining conditions: NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
for _, c := range n.Status.Conditions {
switch c.Type {
case api.NodeReady:
nodeConditionMetrics(ch, descNodeStatusReady, n.Name, c.Status)
case api.NodeOutOfDisk:
nodeConditionMetrics(ch, descNodeStatusOutOfDisk, n.Name, c.Status)
}
}
// Set current phase to 1, others to 0 if it is set.
if p := n.Status.Phase; p != "" {
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodePending), string(api.NodePending))
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodeRunning), string(api.NodeRunning))
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodeTerminated), string(api.NodeTerminated))
}
// Add capacity and allocateable resources if they are set.
addResource := func(d *prometheus.Desc, res api.ResourceList, n api.ResourceName) {
if v, ok := res[n]; ok {
addGauge(d, float64(v.Value()))
}
}
addResource(descNodeStatusCapacityCPU, n.Status.Capacity, api.ResourceCPU)
addResource(descNodeStatusCapacityMemory, n.Status.Capacity, api.ResourceMemory)
addResource(descNodeStatusCapacityPods, n.Status.Capacity, api.ResourcePods)
addResource(descNodeStatusAllocateableCPU, n.Status.Allocatable, api.ResourceCPU)
addResource(descNodeStatusAllocateableMemory, n.Status.Allocatable, api.ResourceMemory)
addResource(descNodeStatusAllocateablePods, n.Status.Allocatable, api.ResourcePods)
}
// nodeStatusMetrics generates one metric for each possible node condition status.
func nodeStatusMetrics(ch chan<- prometheus.Metric, desc *prometheus.Desc, name string, cs api.ConditionStatus) {
// nodeConditionMetrics generates one metric for each possible node condition status.
func nodeConditionMetrics(ch chan<- prometheus.Metric, desc *prometheus.Desc, name string, cs api.ConditionStatus) {
ch <- prometheus.MustNewConstMetric(
desc, prometheus.GaugeValue, boolFloat64(cs == api.ConditionTrue),
name, "true",

View File

@ -20,6 +20,7 @@ import (
"testing"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
)
type mockNodeStore struct {
@ -38,12 +39,27 @@ func TestNodeCollector(t *testing.T) {
# TYPE node_info gauge
# HELP node_status_ready The ready status of a cluster node.
# TYPE node_status_ready gauge
# TYPE node_status_phase gauge
# HELP node_status_phase The phase the node is currently in.
# TYPE node_status_capacity_pods gauge
# HELP node_status_capacity_pods The total pod resources of the node.
# TYPE node_status_capacity_cpu_cores gauge
# HELP node_status_capacity_cpu_cores The total CPU resources of the node.
# TYPE node_status_capacity_memory_bytes gauge
# HELP node_status_capacity_memory_bytes The total memory resources of the node.
# TYPE node_status_allocateable_pods gauge
# HELP node_status_allocateable_pods The pod resources of a node that are available for scheduling.
# TYPE node_status_allocateable_cpu_cores gauge
# HELP node_status_allocateable_cpu_cores The CPU resources of a node that are available for scheduling.
# TYPE node_status_allocateable_memory_bytes gauge
# HELP node_status_allocateable_memory_bytes The memory resources of a node that are available for scheduling.
`
cases := []struct {
nodes []api.Node
want string
nodes []api.Node
metrics []string // which metrics should be checked
want string
}{
// Verify populating of node_info metric.
// Verify populating base metrics and that metrics for unset fields are skipped.
{
nodes: []api.Node{
{
@ -65,7 +81,45 @@ func TestNodeCollector(t *testing.T) {
node_info{container_runtime_version="rkt",kernel_version="kernel",kubelet_version="kubelet",kubeproxy_version="kubeproxy",node="127.0.0.1",os_image="osimage"} 1
`,
},
// Verify condition mappings to 1, 0, and NaN.
// Verify resource metrics.
{
nodes: []api.Node{
{
ObjectMeta: api.ObjectMeta{
Name: "127.0.0.1",
},
Status: api.NodeStatus{
NodeInfo: api.NodeSystemInfo{
KernelVersion: "kernel",
KubeletVersion: "kubelet",
KubeProxyVersion: "kubeproxy",
OSImage: "osimage",
ContainerRuntimeVersion: "rkt",
},
Capacity: api.ResourceList{
api.ResourceCPU: resource.MustParse("4"),
api.ResourceMemory: resource.MustParse("2G"),
api.ResourcePods: resource.MustParse("1000"),
},
Allocatable: api.ResourceList{
api.ResourceCPU: resource.MustParse("3"),
api.ResourceMemory: resource.MustParse("1G"),
api.ResourcePods: resource.MustParse("555"),
},
},
},
},
want: metadata + `
node_info{container_runtime_version="rkt",kernel_version="kernel",kubelet_version="kubelet",kubeproxy_version="kubeproxy",node="127.0.0.1",os_image="osimage"} 1
node_status_capacity_cpu_cores{node="127.0.0.1"} 4
node_status_capacity_memory_bytes{node="127.0.0.1"} 2e9
node_status_capacity_pods{node="127.0.0.1"} 1000
node_status_allocateable_cpu_cores{node="127.0.0.1"} 3
node_status_allocateable_memory_bytes{node="127.0.0.1"} 1e9
node_status_allocateable_pods{node="127.0.0.1"} 555
`,
},
// Verify condition enumerations.
{
nodes: []api.Node{
{
@ -109,10 +163,49 @@ func TestNodeCollector(t *testing.T) {
node_status_ready{node="127.0.0.3",condition="true"} 0
node_status_ready{node="127.0.0.3",condition="false"} 1
node_status_ready{node="127.0.0.3",condition="unknown"} 0
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.1",os_image=""} 1
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.2",os_image=""} 1
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.3",os_image=""} 1
`,
metrics: []string{"node_status_ready"},
},
// Verify phase enumerations.
{
nodes: []api.Node{
{
ObjectMeta: api.ObjectMeta{
Name: "127.0.0.1",
},
Status: api.NodeStatus{
Phase: api.NodeRunning,
},
},
{
ObjectMeta: api.ObjectMeta{
Name: "127.0.0.2",
},
Status: api.NodeStatus{
Phase: api.NodePending,
},
},
{
ObjectMeta: api.ObjectMeta{
Name: "127.0.0.3",
},
Status: api.NodeStatus{
Phase: api.NodeTerminated,
},
},
},
want: metadata + `
node_status_phase{node="127.0.0.1",phase="Terminated"} 0
node_status_phase{node="127.0.0.1",phase="Running"} 1
node_status_phase{node="127.0.0.1",phase="Pending"} 0
node_status_phase{node="127.0.0.2",phase="Terminated"} 0
node_status_phase{node="127.0.0.2",phase="Running"} 0
node_status_phase{node="127.0.0.2",phase="Pending"} 1
node_status_phase{node="127.0.0.3",phase="Terminated"} 1
node_status_phase{node="127.0.0.3",phase="Running"} 0
node_status_phase{node="127.0.0.3",phase="Pending"} 0
`,
metrics: []string{"node_status_phase"},
},
}
for _, c := range cases {
@ -123,7 +216,7 @@ func TestNodeCollector(t *testing.T) {
},
},
}
if err := gatherAndCompare(dc, c.want); err != nil {
if err := gatherAndCompare(dc, c.want, c.metrics); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}