Add more node metrics.
This adds metrics on the current node phase, allocatable and capacity resources, and the out-of-disk condition.
This commit is contained in:
parent
a7da0baf45
commit
15d8fccf53
|
|
@ -28,6 +28,14 @@ additional metrics!
|
|||
| ---------- | ----------- | ----------- |
|
||||
| node_info | Gauge | `node`=<node-address> <br> `kernel_version`=<kernel-version> <br> `os_image`=<os-image-name> <br> `container_runtime_version`=<container-runtime-and-version-combination> <br> `kubelet_version`=<kubelet-version> <br> `kubeproxy_version`=<kubeproxy-version> |
|
||||
| node_status_ready| Gauge | `node`=<node-address> <br> `condition`=<true\|false\|unknown> |
|
||||
| node_status_out_of_disk | Gauge | `node`=<node-address> <br> `condition`=<true\|false\|unknown> |
|
||||
| node_status_phase| Gauge | `node`=&lt;node-address&gt; <br> `phase`=&lt;Pending\|Running\|Terminated&gt; |
|
||||
| node_status_capacity_cpu_cores | Gauge | `node`=<node-address>|
|
||||
| node_status_capacity_memory_bytes | Gauge | `node`=<node-address>|
|
||||
| node_status_capacity_pods | Gauge | `node`=<node-address>|
|
||||
| node_status_allocateable_cpu_cores | Gauge | `node`=<node-address>|
|
||||
| node_status_allocateable_memory_bytes | Gauge | `node`=<node-address>|
|
||||
| node_status_allocateable_pods | Gauge | `node`=<node-address>|
|
||||
| deployment_replicas | Gauge | `deployment`=<deployment-name> <br> `namespace`=<deployment-namespace> |
|
||||
| deployment_replicas_available | Gauge | `deployment`=<deployment-name> <br> `namespace`=<deployment-namespace> |
|
||||
| pod_container_restarts | Counter | `container`=<container-name> <br> `namespace`=<pod-namespace> <br> `pod`=<pod-name> |
|
||||
|
|
|
|||
101
node.go
101
node.go
|
|
@ -41,6 +41,48 @@ var (
|
|||
"The ready status of a cluster node.",
|
||||
[]string{"node", "condition"}, nil,
|
||||
)
|
||||
descNodeStatusOutOfDisk = prometheus.NewDesc(
|
||||
"node_status_out_of_disk",
|
||||
"Whether the node is out of disk space",
|
||||
[]string{"node", "condition"}, nil,
|
||||
)
|
||||
descNodeStatusPhase = prometheus.NewDesc(
|
||||
"node_status_phase",
|
||||
"The phase the node is currently in.",
|
||||
[]string{"node", "phase"}, nil,
|
||||
)
|
||||
|
||||
descNodeStatusCapacityPods = prometheus.NewDesc(
|
||||
"node_status_capacity_pods",
|
||||
"The total pod resources of the node.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
descNodeStatusCapacityCPU = prometheus.NewDesc(
|
||||
"node_status_capacity_cpu_cores",
|
||||
"The total CPU resources of the node.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
descNodeStatusCapacityMemory = prometheus.NewDesc(
|
||||
"node_status_capacity_memory_bytes",
|
||||
"The total memory resources of the node.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
|
||||
descNodeStatusAllocateablePods = prometheus.NewDesc(
|
||||
"node_status_allocateable_pods",
|
||||
"The pod resources of a node that are available for scheduling.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
descNodeStatusAllocateableCPU = prometheus.NewDesc(
|
||||
"node_status_allocateable_cpu_cores",
|
||||
"The CPU resources of a node that are available for scheduling.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
descNodeStatusAllocateableMemory = prometheus.NewDesc(
|
||||
"node_status_allocateable_memory_bytes",
|
||||
"The memory resources of a node that are available for scheduling.",
|
||||
[]string{"node"}, nil,
|
||||
)
|
||||
)
|
||||
|
||||
type nodeStore interface {
|
||||
|
|
@ -56,6 +98,14 @@ type nodeCollector struct {
|
|||
// Describe implements the prometheus.Collector interface.
// It sends every node metric descriptor registered by this collector
// so Prometheus can validate the metrics emitted by Collect.
func (nc *nodeCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- descNodeInfo
	ch <- descNodeStatusReady
	ch <- descNodeStatusOutOfDisk
	ch <- descNodeStatusPhase
	ch <- descNodeStatusCapacityCPU
	ch <- descNodeStatusCapacityMemory
	ch <- descNodeStatusCapacityPods
	ch <- descNodeStatusAllocateableCPU
	ch <- descNodeStatusAllocateableMemory
	ch <- descNodeStatusAllocateablePods
}
|
||||
|
||||
// Collect implements the prometheus.Collector interface.
|
||||
|
|
@ -71,30 +121,55 @@ func (nc *nodeCollector) Collect(ch chan<- prometheus.Metric) {
|
|||
}
|
||||
|
||||
func (nc *nodeCollector) collectNode(ch chan<- prometheus.Metric, n api.Node) {
|
||||
// Collect node conditions, defaulting to false where unset.
|
||||
// TODO(fabxc): add remaining conditions: NodeOutOfDisk, NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
|
||||
for _, c := range n.Status.Conditions {
|
||||
switch c.Type {
|
||||
case api.NodeReady:
|
||||
nodeStatusMetrics(ch, descNodeStatusReady, n.Name, c.Status)
|
||||
}
|
||||
addGauge := func(desc *prometheus.Desc, v float64, lv ...string) {
|
||||
lv = append([]string{n.Name}, lv...)
|
||||
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
|
||||
}
|
||||
|
||||
// NOTE: the instrumentation API requires providing label values in order of declaration
|
||||
// in the metric descriptor. Be careful when making modifications.
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
descNodeInfo, prometheus.GaugeValue, 1,
|
||||
n.Name,
|
||||
addGauge(descNodeInfo, 1,
|
||||
n.Status.NodeInfo.KernelVersion,
|
||||
n.Status.NodeInfo.OSImage,
|
||||
n.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
n.Status.NodeInfo.KubeletVersion,
|
||||
n.Status.NodeInfo.KubeProxyVersion,
|
||||
)
|
||||
|
||||
// Collect node conditions, defaulting to false where unset.
|
||||
// TODO(fabxc): add remaining conditions: NodeMemoryPressure, NodeDiskPressure, NodeNetworkUnavailable
|
||||
for _, c := range n.Status.Conditions {
|
||||
switch c.Type {
|
||||
case api.NodeReady:
|
||||
nodeConditionMetrics(ch, descNodeStatusReady, n.Name, c.Status)
|
||||
case api.NodeOutOfDisk:
|
||||
nodeConditionMetrics(ch, descNodeStatusOutOfDisk, n.Name, c.Status)
|
||||
}
|
||||
}
|
||||
|
||||
// Set current phase to 1, others to 0 if it is set.
|
||||
if p := n.Status.Phase; p != "" {
|
||||
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodePending), string(api.NodePending))
|
||||
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodeRunning), string(api.NodeRunning))
|
||||
addGauge(descNodeStatusPhase, boolFloat64(p == api.NodeTerminated), string(api.NodeTerminated))
|
||||
}
|
||||
|
||||
// Add capacity and allocateable resources if they are set.
|
||||
addResource := func(d *prometheus.Desc, res api.ResourceList, n api.ResourceName) {
|
||||
if v, ok := res[n]; ok {
|
||||
addGauge(d, float64(v.Value()))
|
||||
}
|
||||
}
|
||||
addResource(descNodeStatusCapacityCPU, n.Status.Capacity, api.ResourceCPU)
|
||||
addResource(descNodeStatusCapacityMemory, n.Status.Capacity, api.ResourceMemory)
|
||||
addResource(descNodeStatusCapacityPods, n.Status.Capacity, api.ResourcePods)
|
||||
|
||||
addResource(descNodeStatusAllocateableCPU, n.Status.Allocatable, api.ResourceCPU)
|
||||
addResource(descNodeStatusAllocateableMemory, n.Status.Allocatable, api.ResourceMemory)
|
||||
addResource(descNodeStatusAllocateablePods, n.Status.Allocatable, api.ResourcePods)
|
||||
}
|
||||
|
||||
// nodeStatusMetrics generates one metric for each possible node condition status.
|
||||
func nodeStatusMetrics(ch chan<- prometheus.Metric, desc *prometheus.Desc, name string, cs api.ConditionStatus) {
|
||||
// nodeConditionMetrics generates one metric for each possible node condition status.
|
||||
func nodeConditionMetrics(ch chan<- prometheus.Metric, desc *prometheus.Desc, name string, cs api.ConditionStatus) {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
desc, prometheus.GaugeValue, boolFloat64(cs == api.ConditionTrue),
|
||||
name, "true",
|
||||
|
|
|
|||
109
node_test.go
109
node_test.go
|
|
@ -20,6 +20,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"k8s.io/kubernetes/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/resource"
|
||||
)
|
||||
|
||||
type mockNodeStore struct {
|
||||
|
|
@ -38,12 +39,27 @@ func TestNodeCollector(t *testing.T) {
|
|||
# TYPE node_info gauge
|
||||
# HELP node_status_ready The ready status of a cluster node.
|
||||
# TYPE node_status_ready gauge
|
||||
# TYPE node_status_phase gauge
|
||||
# HELP node_status_phase The phase the node is currently in.
|
||||
# TYPE node_status_capacity_pods gauge
|
||||
# HELP node_status_capacity_pods The total pod resources of the node.
|
||||
# TYPE node_status_capacity_cpu_cores gauge
|
||||
# HELP node_status_capacity_cpu_cores The total CPU resources of the node.
|
||||
# TYPE node_status_capacity_memory_bytes gauge
|
||||
# HELP node_status_capacity_memory_bytes The total memory resources of the node.
|
||||
# TYPE node_status_allocateable_pods gauge
|
||||
# HELP node_status_allocateable_pods The pod resources of a node that are available for scheduling.
|
||||
# TYPE node_status_allocateable_cpu_cores gauge
|
||||
# HELP node_status_allocateable_cpu_cores The CPU resources of a node that are available for scheduling.
|
||||
# TYPE node_status_allocateable_memory_bytes gauge
|
||||
# HELP node_status_allocateable_memory_bytes The memory resources of a node that are available for scheduling.
|
||||
`
|
||||
cases := []struct {
|
||||
nodes []api.Node
|
||||
want string
|
||||
nodes []api.Node
|
||||
metrics []string // which metrics should be checked
|
||||
want string
|
||||
}{
|
||||
// Verify populating of node_info metric.
|
||||
// Verify populating base metrics and that metrics for unset fields are skipped.
|
||||
{
|
||||
nodes: []api.Node{
|
||||
{
|
||||
|
|
@ -65,7 +81,45 @@ func TestNodeCollector(t *testing.T) {
|
|||
node_info{container_runtime_version="rkt",kernel_version="kernel",kubelet_version="kubelet",kubeproxy_version="kubeproxy",node="127.0.0.1",os_image="osimage"} 1
|
||||
`,
|
||||
},
|
||||
// Verify condition mappings to 1, 0, and NaN.
|
||||
// Verify resource metrics.
|
||||
{
|
||||
nodes: []api.Node{
|
||||
{
|
||||
ObjectMeta: api.ObjectMeta{
|
||||
Name: "127.0.0.1",
|
||||
},
|
||||
Status: api.NodeStatus{
|
||||
NodeInfo: api.NodeSystemInfo{
|
||||
KernelVersion: "kernel",
|
||||
KubeletVersion: "kubelet",
|
||||
KubeProxyVersion: "kubeproxy",
|
||||
OSImage: "osimage",
|
||||
ContainerRuntimeVersion: "rkt",
|
||||
},
|
||||
Capacity: api.ResourceList{
|
||||
api.ResourceCPU: resource.MustParse("4"),
|
||||
api.ResourceMemory: resource.MustParse("2G"),
|
||||
api.ResourcePods: resource.MustParse("1000"),
|
||||
},
|
||||
Allocatable: api.ResourceList{
|
||||
api.ResourceCPU: resource.MustParse("3"),
|
||||
api.ResourceMemory: resource.MustParse("1G"),
|
||||
api.ResourcePods: resource.MustParse("555"),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
want: metadata + `
|
||||
node_info{container_runtime_version="rkt",kernel_version="kernel",kubelet_version="kubelet",kubeproxy_version="kubeproxy",node="127.0.0.1",os_image="osimage"} 1
|
||||
node_status_capacity_cpu_cores{node="127.0.0.1"} 4
|
||||
node_status_capacity_memory_bytes{node="127.0.0.1"} 2e9
|
||||
node_status_capacity_pods{node="127.0.0.1"} 1000
|
||||
node_status_allocateable_cpu_cores{node="127.0.0.1"} 3
|
||||
node_status_allocateable_memory_bytes{node="127.0.0.1"} 1e9
|
||||
node_status_allocateable_pods{node="127.0.0.1"} 555
|
||||
`,
|
||||
},
|
||||
// Verify condition enumerations.
|
||||
{
|
||||
nodes: []api.Node{
|
||||
{
|
||||
|
|
@ -109,10 +163,49 @@ func TestNodeCollector(t *testing.T) {
|
|||
node_status_ready{node="127.0.0.3",condition="true"} 0
|
||||
node_status_ready{node="127.0.0.3",condition="false"} 1
|
||||
node_status_ready{node="127.0.0.3",condition="unknown"} 0
|
||||
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.1",os_image=""} 1
|
||||
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.2",os_image=""} 1
|
||||
node_info{container_runtime_version="",kernel_version="",kubelet_version="",kubeproxy_version="",node="127.0.0.3",os_image=""} 1
|
||||
`,
|
||||
metrics: []string{"node_status_ready"},
|
||||
},
|
||||
// Verify phase enumerations.
|
||||
{
|
||||
nodes: []api.Node{
|
||||
{
|
||||
ObjectMeta: api.ObjectMeta{
|
||||
Name: "127.0.0.1",
|
||||
},
|
||||
Status: api.NodeStatus{
|
||||
Phase: api.NodeRunning,
|
||||
},
|
||||
},
|
||||
{
|
||||
ObjectMeta: api.ObjectMeta{
|
||||
Name: "127.0.0.2",
|
||||
},
|
||||
Status: api.NodeStatus{
|
||||
Phase: api.NodePending,
|
||||
},
|
||||
},
|
||||
{
|
||||
ObjectMeta: api.ObjectMeta{
|
||||
Name: "127.0.0.3",
|
||||
},
|
||||
Status: api.NodeStatus{
|
||||
Phase: api.NodeTerminated,
|
||||
},
|
||||
},
|
||||
},
|
||||
want: metadata + `
|
||||
node_status_phase{node="127.0.0.1",phase="Terminated"} 0
|
||||
node_status_phase{node="127.0.0.1",phase="Running"} 1
|
||||
node_status_phase{node="127.0.0.1",phase="Pending"} 0
|
||||
node_status_phase{node="127.0.0.2",phase="Terminated"} 0
|
||||
node_status_phase{node="127.0.0.2",phase="Running"} 0
|
||||
node_status_phase{node="127.0.0.2",phase="Pending"} 1
|
||||
node_status_phase{node="127.0.0.3",phase="Terminated"} 1
|
||||
node_status_phase{node="127.0.0.3",phase="Running"} 0
|
||||
node_status_phase{node="127.0.0.3",phase="Pending"} 0
|
||||
`,
|
||||
metrics: []string{"node_status_phase"},
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
|
|
@ -123,7 +216,7 @@ func TestNodeCollector(t *testing.T) {
|
|||
},
|
||||
},
|
||||
}
|
||||
if err := gatherAndCompare(dc, c.want); err != nil {
|
||||
if err := gatherAndCompare(dc, c.want, c.metrics); err != nil {
|
||||
t.Errorf("unexpected collecting result:\n%s", err)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue