From 22d372ba79bbe57fcb575660f2eb84b735b17f7f Mon Sep 17 00:00:00 2001 From: Poor12 Date: Thu, 27 Oct 2022 10:50:00 +0800 Subject: [PATCH] add metrics for cluster Signed-off-by: Poor12 --- cmd/agent/app/agent.go | 4 + .../app/controllermanager.go | 4 + .../status/cluster_status_controller.go | 7 + pkg/metrics/cluster.go | 136 +++++++ pkg/metrics/cluster_test.go | 343 ++++++++++++++++++ 5 files changed, 494 insertions(+) create mode 100644 pkg/metrics/cluster.go create mode 100644 pkg/metrics/cluster_test.go diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go index 9f3ea9717..ede364f54 100644 --- a/cmd/agent/app/agent.go +++ b/cmd/agent/app/agent.go @@ -19,6 +19,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/config/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/healthz" + crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "github.com/karmada-io/karmada/cmd/agent/app/options" clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1" @@ -30,6 +31,7 @@ import ( "github.com/karmada-io/karmada/pkg/controllers/status" karmadaclientset "github.com/karmada-io/karmada/pkg/generated/clientset/versioned" "github.com/karmada-io/karmada/pkg/karmadactl/util/apiclient" + "github.com/karmada-io/karmada/pkg/metrics" "github.com/karmada-io/karmada/pkg/resourceinterpreter" "github.com/karmada-io/karmada/pkg/sharedcli" "github.com/karmada-io/karmada/pkg/sharedcli/klogflag" @@ -210,6 +212,8 @@ func run(ctx context.Context, opts *options.Options) error { return err } + crtlmetrics.Registry.MustRegister(metrics.ClusterCollectors()...) 
+ if err = setupControllers(controllerManager, opts, ctx.Done()); err != nil { return err } diff --git a/cmd/controller-manager/app/controllermanager.go b/cmd/controller-manager/app/controllermanager.go index 15161f7e1..60c544c34 100644 --- a/cmd/controller-manager/app/controllermanager.go +++ b/cmd/controller-manager/app/controllermanager.go @@ -21,6 +21,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/config/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/healthz" + crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/predicate" "github.com/karmada-io/karmada/cmd/controller-manager/app/options" @@ -43,6 +44,7 @@ import ( "github.com/karmada-io/karmada/pkg/detector" "github.com/karmada-io/karmada/pkg/features" "github.com/karmada-io/karmada/pkg/karmadactl/util/apiclient" + "github.com/karmada-io/karmada/pkg/metrics" "github.com/karmada-io/karmada/pkg/resourceinterpreter" "github.com/karmada-io/karmada/pkg/sharedcli" "github.com/karmada-io/karmada/pkg/sharedcli/klogflag" @@ -154,6 +156,8 @@ func Run(ctx context.Context, opts *options.Options) error { return err } + crtlmetrics.Registry.MustRegister(metrics.ClusterCollectors()...) + setupControllers(controllerManager, opts, ctx.Done()) // blocks until the context is done. 
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go index bf15690a6..73e522020 100644 --- a/pkg/controllers/status/cluster_status_controller.go +++ b/pkg/controllers/status/cluster_status_controller.go @@ -32,6 +32,7 @@ import ( clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1" "github.com/karmada-io/karmada/pkg/features" + "github.com/karmada-io/karmada/pkg/metrics" "github.com/karmada-io/karmada/pkg/modeling" "github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag" "github.com/karmada-io/karmada/pkg/util" @@ -153,6 +154,12 @@ func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager } func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Cluster) (controllerruntime.Result, error) { + start := time.Now() + defer func() { + metrics.RecordClusterStatus(cluster) + metrics.RecordClusterSyncStatusDuration(cluster, start) + }() + currentClusterStatus := *cluster.Status.DeepCopy() // create a ClusterClient for the given member cluster diff --git a/pkg/metrics/cluster.go b/pkg/metrics/cluster.go new file mode 100644 index 000000000..3805a36cb --- /dev/null +++ b/pkg/metrics/cluster.go @@ -0,0 +1,136 @@ +package metrics + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1" + "github.com/karmada-io/karmada/pkg/util" + utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics" +) + +const ( + clusterReadyMetricsName = "cluster_ready_state" + clusterTotalNodeNumberMetricsName = "cluster_node_number" + clusterReadyNodeNumberMetricsName = "cluster_ready_node_number" + clusterMemoryAllocatableMetricsName = "cluster_memory_allocatable_bytes" + clusterCPUAllocatableMetricsName = "cluster_cpu_allocatable_number" + clusterPodAllocatableMetricsName = "cluster_pod_allocatable_number" + clusterMemoryAllocatedMetricsName = "cluster_memory_allocated_bytes" +
clusterCPUAllocatedMetricsName = "cluster_cpu_allocated_number" + clusterPodAllocatedMetricsName = "cluster_pod_allocated_number" + clusterSyncStatusDurationMetricsName = "cluster_sync_status_duration_seconds" +) + +var ( + // clusterReadyGauge reports whether the given cluster is ready: 1 if ready, 0 otherwise. + clusterReadyGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterReadyMetricsName, + Help: "State of the cluster(1 if ready, 0 otherwise).", + }, []string{"cluster_name"}) + + // clusterTotalNodeNumberGauge reports the total number of nodes in the given cluster. + clusterTotalNodeNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterTotalNodeNumberMetricsName, + Help: "Number of nodes in the cluster.", + }, []string{"cluster_name"}) + + // clusterReadyNodeNumberGauge reports the number of ready nodes in the given cluster. + clusterReadyNodeNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterReadyNodeNumberMetricsName, + Help: "Number of ready nodes in the cluster.", + }, []string{"cluster_name"}) + + // clusterMemoryAllocatableGauge reports the allocatable memory of the given cluster in bytes. + clusterMemoryAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterMemoryAllocatableMetricsName, + Help: "Allocatable cluster memory resource in bytes.", + }, []string{"cluster_name"}) + + // clusterCPUAllocatableGauge reports the allocatable CPU of the given cluster in CPU units, fractions included (200m is exported as 0.2). + clusterCPUAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterCPUAllocatableMetricsName, + Help: "Number of allocatable CPU in the cluster.", + }, []string{"cluster_name"}) + + // clusterPodAllocatableGauge reports the number of pods allocatable in the given cluster.
+ clusterPodAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterPodAllocatableMetricsName, + Help: "Number of allocatable pods in the cluster.", + }, []string{"cluster_name"}) + + // clusterMemoryAllocatedGauge reports the allocated memory of the given cluster in bytes. + clusterMemoryAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterMemoryAllocatedMetricsName, + Help: "Allocated cluster memory resource in bytes.", + }, []string{"cluster_name"}) + + // clusterCPUAllocatedGauge reports the allocated CPU of the given cluster in CPU units, fractions included (200m is exported as 0.2). + clusterCPUAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterCPUAllocatedMetricsName, + Help: "Number of allocated CPU in the cluster.", + }, []string{"cluster_name"}) + + // clusterPodAllocatedGauge reports the number of pods allocated in the given cluster. + clusterPodAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: clusterPodAllocatedMetricsName, + Help: "Number of allocated pods in the cluster.", + }, []string{"cluster_name"}) + + // clusterSyncStatusDuration is a histogram of the seconds spent on each status sync of the given cluster. + clusterSyncStatusDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: clusterSyncStatusDurationMetricsName, + Help: "Duration in seconds for syncing the status of the cluster once.", + }, []string{"cluster_name"}) +) + +// RecordClusterStatus updates the readiness, node and resource gauges from the status of the given cluster.
+func RecordClusterStatus(cluster *v1alpha1.Cluster) {
+	name, status := cluster.Name, &cluster.Status
+	ready := 0.0
+	if util.IsClusterReady(status) {
+		ready = 1
+	}
+	clusterReadyGauge.WithLabelValues(name).Set(ready)
+
+	if nodes := status.NodeSummary; nodes != nil {
+		clusterTotalNodeNumberGauge.WithLabelValues(name).Set(float64(nodes.TotalNum))
+		clusterReadyNodeNumberGauge.WithLabelValues(name).Set(float64(nodes.ReadyNum))
+	}
+	// Resource gauges are only written for the lists the summary actually carries.
+	if status.ResourceSummary == nil {
+		return
+	}
+	if allocatable := status.ResourceSummary.Allocatable; allocatable != nil {
+		clusterMemoryAllocatableGauge.WithLabelValues(name).Set(allocatable.Memory().AsApproximateFloat64())
+		clusterCPUAllocatableGauge.WithLabelValues(name).Set(allocatable.Cpu().AsApproximateFloat64())
+		clusterPodAllocatableGauge.WithLabelValues(name).Set(allocatable.Pods().AsApproximateFloat64())
+	}
+	if allocated := status.ResourceSummary.Allocated; allocated != nil {
+		clusterMemoryAllocatedGauge.WithLabelValues(name).Set(allocated.Memory().AsApproximateFloat64())
+		clusterCPUAllocatedGauge.WithLabelValues(name).Set(allocated.Cpu().AsApproximateFloat64())
+		clusterPodAllocatedGauge.WithLabelValues(name).Set(allocated.Pods().AsApproximateFloat64())
+	}
+}
+
+// RecordClusterSyncStatusDuration feeds utilmetrics.DurationInSeconds(startTime) into the sync duration histogram of the given cluster.
+func RecordClusterSyncStatusDuration(cluster *v1alpha1.Cluster, startTime time.Time) {
+	clusterSyncStatusDuration.WithLabelValues(cluster.Name).Observe(utilmetrics.DurationInSeconds(startTime))
+}
+
+// ClusterCollectors returns every collector of the cluster metrics defined in this file.
+func ClusterCollectors() []prometheus.Collector {
+	return []prometheus.Collector{
+		clusterReadyGauge,
+		clusterTotalNodeNumberGauge,
+		clusterReadyNodeNumberGauge,
+		clusterMemoryAllocatableGauge,
+		clusterCPUAllocatableGauge,
+		clusterPodAllocatableGauge,
+		clusterMemoryAllocatedGauge,
+		clusterCPUAllocatedGauge,
+		clusterPodAllocatedGauge,
+		clusterSyncStatusDuration,
+	}
+}
diff --git a/pkg/metrics/cluster_test.go b/pkg/metrics/cluster_test.go
new file mode 100644
index 000000000..54ce1a73e
--- /dev/null
+++ b/pkg/metrics/cluster_test.go
@@ -0,0 +1,218 @@
+package metrics
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/component-base/metrics/testutil"
+
+	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
+)
+
+// newReadyCluster returns a Push-mode cluster named "foo" whose Ready
+// condition is true. Tests add the status summaries they want to exercise.
+func newReadyCluster() *clusterv1alpha1.Cluster {
+	return &clusterv1alpha1.Cluster{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "foo",
+		},
+		Spec: clusterv1alpha1.ClusterSpec{
+			SyncMode: clusterv1alpha1.Push,
+		},
+		Status: clusterv1alpha1.ClusterStatus{
+			Conditions: []metav1.Condition{
+				{
+					Type:   clusterv1alpha1.ClusterConditionReady,
+					Status: metav1.ConditionTrue,
+				},
+			},
+		},
+	}
+}
+
+// assertRecorded resets gauge, records the status of cluster and reports a
+// test error unless gauge then exposes exactly want for metricName.
+func assertRecorded(t *testing.T, cluster *clusterv1alpha1.Cluster, gauge *prometheus.GaugeVec, metricName, want string) {
+	t.Helper()
+
+	gauge.Reset()
+	RecordClusterStatus(cluster)
+	if err := testutil.CollectAndCompare(gauge, strings.NewReader(want), metricName); err != nil {
+		t.Errorf("unexpected collecting result:\n%s", err)
+	}
+}
+
+// TestClusterReadyMetrics checks that the ready gauge is 1 for a ready
+// cluster and 0 for a cluster whose conditions do not mark it ready.
+func TestClusterReadyMetrics(t *testing.T) {
+	tests := []struct {
+		name    string
+		cluster *clusterv1alpha1.Cluster
+		want    string
+	}{
+		{
+			name:    "cluster ready",
+			cluster: newReadyCluster(),
+			want: `
+# HELP cluster_ready_state State of the cluster(1 if ready, 0 otherwise).
+# TYPE cluster_ready_state gauge
+cluster_ready_state{cluster_name="foo"} 1
+`,
+		},
+		{
+			name: "cluster not ready",
+			cluster: &clusterv1alpha1.Cluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "foo",
+				},
+				Spec: clusterv1alpha1.ClusterSpec{
+					SyncMode: clusterv1alpha1.Pull,
+				},
+				Status: clusterv1alpha1.ClusterStatus{
+					Conditions: []metav1.Condition{
+						{},
+					},
+				},
+			},
+			want: `
+# HELP cluster_ready_state State of the cluster(1 if ready, 0 otherwise).
+# TYPE cluster_ready_state gauge
+cluster_ready_state{cluster_name="foo"} 0
+`,
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			assertRecorded(t, test.cluster, clusterReadyGauge, clusterReadyMetricsName, test.want)
+		})
+	}
+}
+
+// TestClusterTotalNodeNumberMetrics checks that the node gauge exports NodeSummary.TotalNum.
+func TestClusterTotalNodeNumberMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.NodeSummary = &clusterv1alpha1.NodeSummary{
+		TotalNum: 100,
+	}
+	want := `
+# HELP cluster_node_number Number of nodes in the cluster.
+# TYPE cluster_node_number gauge
+cluster_node_number{cluster_name="foo"} 100
+`
+	assertRecorded(t, testCluster, clusterTotalNodeNumberGauge, clusterTotalNodeNumberMetricsName, want)
+}
+
+// TestClusterReadyNodeNumberMetrics checks that the ready node gauge exports NodeSummary.ReadyNum.
+func TestClusterReadyNodeNumberMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.NodeSummary = &clusterv1alpha1.NodeSummary{
+		TotalNum: 100,
+		ReadyNum: 10,
+	}
+	want := `
+# HELP cluster_ready_node_number Number of ready nodes in the cluster.
+# TYPE cluster_ready_node_number gauge
+cluster_ready_node_number{cluster_name="foo"} 10
+`
+	assertRecorded(t, testCluster, clusterReadyNodeNumberGauge, clusterReadyNodeNumberMetricsName, want)
+}
+
+// TestClusterMemoryAllocatableMetrics checks that allocatable memory is exported in bytes.
+func TestClusterMemoryAllocatableMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocatable: corev1.ResourceList{
+			corev1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_memory_allocatable_bytes Allocatable cluster memory resource in bytes.
+# TYPE cluster_memory_allocatable_bytes gauge
+cluster_memory_allocatable_bytes{cluster_name="foo"} 200
+`
+	assertRecorded(t, testCluster, clusterMemoryAllocatableGauge, clusterMemoryAllocatableMetricsName, want)
+}
+
+// TestClusterCPUAllocatableMetrics checks that 200m of allocatable CPU is exported as 0.2.
+func TestClusterCPUAllocatableMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocatable: corev1.ResourceList{
+			corev1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_cpu_allocatable_number Number of allocatable CPU in the cluster.
+# TYPE cluster_cpu_allocatable_number gauge
+cluster_cpu_allocatable_number{cluster_name="foo"} 0.2
+`
+	assertRecorded(t, testCluster, clusterCPUAllocatableGauge, clusterCPUAllocatableMetricsName, want)
+}
+
+// TestClusterPodAllocatableMetrics checks that the allocatable pod count is exported.
+func TestClusterPodAllocatableMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocatable: corev1.ResourceList{
+			corev1.ResourcePods: *resource.NewQuantity(110, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_pod_allocatable_number Number of allocatable pods in the cluster.
+# TYPE cluster_pod_allocatable_number gauge
+cluster_pod_allocatable_number{cluster_name="foo"} 110
+`
+	assertRecorded(t, testCluster, clusterPodAllocatableGauge, clusterPodAllocatableMetricsName, want)
+}
+
+// TestClusterMemoryAllocatedMetrics checks that allocated memory is exported in bytes.
+func TestClusterMemoryAllocatedMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocated: corev1.ResourceList{
+			corev1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_memory_allocated_bytes Allocated cluster memory resource in bytes.
+# TYPE cluster_memory_allocated_bytes gauge
+cluster_memory_allocated_bytes{cluster_name="foo"} 200
+`
+	assertRecorded(t, testCluster, clusterMemoryAllocatedGauge, clusterMemoryAllocatedMetricsName, want)
+}
+
+// TestClusterCPUAllocatedMetrics checks that 200m of allocated CPU is exported as 0.2.
+func TestClusterCPUAllocatedMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocated: corev1.ResourceList{
+			corev1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_cpu_allocated_number Number of allocated CPU in the cluster.
+# TYPE cluster_cpu_allocated_number gauge
+cluster_cpu_allocated_number{cluster_name="foo"} 0.2
+`
+	assertRecorded(t, testCluster, clusterCPUAllocatedGauge, clusterCPUAllocatedMetricsName, want)
+}
+
+// TestClusterPodAllocatedMetrics checks that the allocated pod count is exported.
+func TestClusterPodAllocatedMetrics(t *testing.T) {
+	testCluster := newReadyCluster()
+	testCluster.Status.ResourceSummary = &clusterv1alpha1.ResourceSummary{
+		Allocated: corev1.ResourceList{
+			corev1.ResourcePods: *resource.NewQuantity(110, resource.DecimalSI),
+		},
+	}
+	want := `
+# HELP cluster_pod_allocated_number Number of allocated pods in the cluster.
+# TYPE cluster_pod_allocated_number gauge
+cluster_pod_allocated_number{cluster_name="foo"} 110
+`
+	assertRecorded(t, testCluster, clusterPodAllocatedGauge, clusterPodAllocatedMetricsName, want)
+}