Merge pull request #2496 from Poor12/controller-metrics

Add metrics for cluster
This commit is contained in:
karmada-bot 2022-11-25 09:25:08 +08:00 committed by GitHub
commit 6ef120f6e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 494 additions and 0 deletions

View File

@ -19,6 +19,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/config/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/healthz"
crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"github.com/karmada-io/karmada/cmd/agent/app/options"
clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
@ -30,6 +31,7 @@ import (
"github.com/karmada-io/karmada/pkg/controllers/status"
karmadaclientset "github.com/karmada-io/karmada/pkg/generated/clientset/versioned"
"github.com/karmada-io/karmada/pkg/karmadactl/util/apiclient"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli"
"github.com/karmada-io/karmada/pkg/sharedcli/klogflag"
@ -210,6 +212,8 @@ func run(ctx context.Context, opts *options.Options) error {
return err
}
crtlmetrics.Registry.MustRegister(metrics.ClusterCollectors()...)
if err = setupControllers(controllerManager, opts, ctx.Done()); err != nil {
return err
}

View File

@ -21,6 +21,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/config/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/healthz"
crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"github.com/karmada-io/karmada/cmd/controller-manager/app/options"
@ -43,6 +44,7 @@ import (
"github.com/karmada-io/karmada/pkg/detector"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/karmadactl/util/apiclient"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli"
"github.com/karmada-io/karmada/pkg/sharedcli/klogflag"
@ -154,6 +156,8 @@ func Run(ctx context.Context, opts *options.Options) error {
return err
}
crtlmetrics.Registry.MustRegister(metrics.ClusterCollectors()...)
setupControllers(controllerManager, opts, ctx.Done())
// blocks until the context is done.

View File

@ -32,6 +32,7 @@ import (
clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/modeling"
"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
"github.com/karmada-io/karmada/pkg/util"
@ -153,6 +154,12 @@ func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager
}
func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Cluster) (controllerruntime.Result, error) {
start := time.Now()
defer func() {
metrics.RecordClusterStatus(cluster)
metrics.RecordClusterSyncStatusDuration(cluster, start)
}()
currentClusterStatus := *cluster.Status.DeepCopy()
// create a ClusterClient for the given member cluster

136
pkg/metrics/cluster.go Normal file
View File

@ -0,0 +1,136 @@
package metrics
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
"github.com/karmada-io/karmada/pkg/util"
utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
)
// Names of the per-cluster metrics exposed by this package. Each constant is
// used as the Name of exactly one collector declared in the var block below,
// and the unit tests pass the same constants to the collect-and-compare helper
// to select which metric family to check.
const (
clusterReadyMetricsName = "cluster_ready_state"
clusterTotalNodeNumberMetricsName = "cluster_node_number"
clusterReadyNodeNumberMetricsName = "cluster_ready_node_number"
clusterMemoryAllocatableMetricsName = "cluster_memory_allocatable_bytes"
clusterCPUAllocatableMetricsName = "cluster_cpu_allocatable_number"
clusterPodAllocatableMetricsName = "cluster_pod_allocatable_number"
clusterMemoryAllocatedMetricsName = "cluster_memory_allocated_bytes"
clusterCPUAllocatedMetricsName = "cluster_cpu_allocated_number"
clusterPodAllocatedMetricsName = "cluster_pod_allocated_number"
clusterSyncStatusDurationMetricsName = "cluster_sync_status_duration_seconds"
)
// Per-cluster collectors. Every vector has a single "cluster_name" label,
// which RecordClusterStatus and RecordClusterSyncStatusDuration fill with the
// name of the Cluster object. Nothing is registered here; ClusterCollectors
// returns all of these so the caller can register them.
var (
// clusterReadyGauge reports whether the cluster is ready (1) or not (0).
clusterReadyGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterReadyMetricsName,
Help: "State of the cluster(1 if ready, 0 otherwise).",
}, []string{"cluster_name"})
// clusterTotalNodeNumberGauge reports the total number of nodes in the given
// cluster, taken from Status.NodeSummary.TotalNum.
clusterTotalNodeNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterTotalNodeNumberMetricsName,
Help: "Number of nodes in the cluster.",
}, []string{"cluster_name"})
// clusterReadyNodeNumberGauge reports the number of ready nodes in the given
// cluster, taken from Status.NodeSummary.ReadyNum.
clusterReadyNodeNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterReadyNodeNumberMetricsName,
Help: "Number of ready nodes in the cluster.",
}, []string{"cluster_name"})
// clusterMemoryAllocatableGauge reports the allocatable memory of the given
// cluster in bytes.
clusterMemoryAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterMemoryAllocatableMetricsName,
Help: "Allocatable cluster memory resource in bytes.",
}, []string{"cluster_name"})
// clusterCPUAllocatableGauge reports the allocatable CPU of the given cluster.
// The value is the CPU quantity converted to a float, so it can be fractional
// (a quantity of 200m is exported as 0.2).
clusterCPUAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterCPUAllocatableMetricsName,
Help: "Number of allocatable CPU in the cluster.",
}, []string{"cluster_name"})
// clusterPodAllocatableGauge reports the allocatable Pod number in the given cluster.
clusterPodAllocatableGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterPodAllocatableMetricsName,
Help: "Number of allocatable pods in the cluster.",
}, []string{"cluster_name"})
// clusterMemoryAllocatedGauge reports the allocated memory of the given
// cluster in bytes.
clusterMemoryAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterMemoryAllocatedMetricsName,
Help: "Allocated cluster memory resource in bytes.",
}, []string{"cluster_name"})
// clusterCPUAllocatedGauge reports the allocated CPU of the given cluster.
// The value is the CPU quantity converted to a float, so it can be fractional
// (a quantity of 200m is exported as 0.2).
clusterCPUAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterCPUAllocatedMetricsName,
Help: "Number of allocated CPU in the cluster.",
}, []string{"cluster_name"})
// clusterPodAllocatedGauge reports the allocated Pod number in the given cluster.
clusterPodAllocatedGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: clusterPodAllocatedMetricsName,
Help: "Number of allocated pods in the cluster.",
}, []string{"cluster_name"})
// clusterSyncStatusDuration reports how long one status sync of the given
// cluster took, in seconds. No Buckets are configured, so the Prometheus
// client library's default histogram buckets apply.
clusterSyncStatusDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: clusterSyncStatusDurationMetricsName,
Help: "Duration in seconds for syncing the status of the cluster once.",
}, []string{"cluster_name"})
)
// RecordClusterStatus publishes the current status of the given cluster to the
// per-cluster gauges, using the cluster's name as the "cluster_name" label.
//
// The readiness gauge is always written. Node counts are written only when
// Status.NodeSummary is set, and the allocatable/allocated resource gauges are
// written only when the matching list in Status.ResourceSummary is set. Gauges
// whose source data is absent are left untouched.
func RecordClusterStatus(cluster *v1alpha1.Cluster) {
	name := cluster.Name
	status := &cluster.Status

	readyGauge := clusterReadyGauge.WithLabelValues(name)
	if util.IsClusterReady(status) {
		readyGauge.Set(1)
	} else {
		readyGauge.Set(0)
	}

	if nodes := status.NodeSummary; nodes != nil {
		clusterTotalNodeNumberGauge.WithLabelValues(name).Set(float64(nodes.TotalNum))
		clusterReadyNodeNumberGauge.WithLabelValues(name).Set(float64(nodes.ReadyNum))
	}

	resources := status.ResourceSummary
	if resources == nil {
		// Nothing reported about resources yet.
		return
	}

	if allocatable := resources.Allocatable; allocatable != nil {
		clusterMemoryAllocatableGauge.WithLabelValues(name).Set(allocatable.Memory().AsApproximateFloat64())
		clusterCPUAllocatableGauge.WithLabelValues(name).Set(allocatable.Cpu().AsApproximateFloat64())
		clusterPodAllocatableGauge.WithLabelValues(name).Set(allocatable.Pods().AsApproximateFloat64())
	}

	if allocated := resources.Allocated; allocated != nil {
		clusterMemoryAllocatedGauge.WithLabelValues(name).Set(allocated.Memory().AsApproximateFloat64())
		clusterCPUAllocatedGauge.WithLabelValues(name).Set(allocated.Cpu().AsApproximateFloat64())
		clusterPodAllocatedGauge.WithLabelValues(name).Set(allocated.Pods().AsApproximateFloat64())
	}
}
// RecordClusterSyncStatusDuration adds one observation to the sync-duration
// histogram of the given cluster. The observed value is whatever
// utilmetrics.DurationInSeconds computes from startTime.
func RecordClusterSyncStatusDuration(cluster *v1alpha1.Cluster, startTime time.Time) {
	observer := clusterSyncStatusDuration.WithLabelValues(cluster.Name)
	elapsedSeconds := utilmetrics.DurationInSeconds(startTime)
	observer.Observe(elapsedSeconds)
}
// ClusterCollectors returns every cluster-related collector declared in this
// package, in a fixed order, so that callers can register them all at once.
func ClusterCollectors() []prometheus.Collector {
	collectors := make([]prometheus.Collector, 0, 10)

	// Readiness and node counts.
	collectors = append(collectors,
		clusterReadyGauge,
		clusterTotalNodeNumberGauge,
		clusterReadyNodeNumberGauge,
	)
	// Allocatable resources.
	collectors = append(collectors,
		clusterMemoryAllocatableGauge,
		clusterCPUAllocatableGauge,
		clusterPodAllocatableGauge,
	)
	// Allocated resources.
	collectors = append(collectors,
		clusterMemoryAllocatedGauge,
		clusterCPUAllocatedGauge,
		clusterPodAllocatedGauge,
	)
	// Status sync latency.
	collectors = append(collectors, clusterSyncStatusDuration)

	return collectors
}

343
pkg/metrics/cluster_test.go Normal file
View File

@ -0,0 +1,343 @@
package metrics
import (
"strings"
"testing"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/component-base/metrics/testutil"
clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
)
// TestClusterReadyMetrics verifies that RecordClusterStatus exports 1 through
// the cluster_ready_state gauge when the cluster carries a true Ready
// condition, and 0 otherwise.
//
// Each table entry runs as its own subtest so that a failure is reported under
// the case name instead of as an anonymous loop iteration.
func TestClusterReadyMetrics(t *testing.T) {
	tests := []struct {
		name    string
		cluster *clusterv1alpha1.Cluster
		want    string
	}{
		{
			name: "cluster ready",
			cluster: &clusterv1alpha1.Cluster{
				ObjectMeta: metav1.ObjectMeta{
					Name: "foo",
				},
				Spec: clusterv1alpha1.ClusterSpec{
					SyncMode: clusterv1alpha1.Push,
				},
				Status: clusterv1alpha1.ClusterStatus{
					Conditions: []metav1.Condition{
						{
							Type:   clusterv1alpha1.ClusterConditionReady,
							Status: metav1.ConditionTrue,
						},
					},
				},
			},
			want: `
# HELP cluster_ready_state State of the cluster(1 if ready, 0 otherwise).
# TYPE cluster_ready_state gauge
cluster_ready_state{cluster_name="foo"} 1
`,
		},
		{
			name: "cluster not ready",
			cluster: &clusterv1alpha1.Cluster{
				ObjectMeta: metav1.ObjectMeta{
					Name: "foo",
				},
				Spec: clusterv1alpha1.ClusterSpec{
					SyncMode: clusterv1alpha1.Pull,
				},
				Status: clusterv1alpha1.ClusterStatus{
					// A single empty condition: no Ready condition is present.
					Conditions: []metav1.Condition{
						{},
					},
				},
			},
			want: `
# HELP cluster_ready_state State of the cluster(1 if ready, 0 otherwise).
# TYPE cluster_ready_state gauge
cluster_ready_state{cluster_name="foo"} 0
`,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Drop samples left behind by the previous case or other tests.
			clusterReadyGauge.Reset()
			RecordClusterStatus(test.cluster)
			if err := testutil.CollectAndCompare(clusterReadyGauge, strings.NewReader(test.want), clusterReadyMetricsName); err != nil {
				t.Errorf("unexpected collecting result:\n%s", err)
			}
		})
	}
}
// TestClusterTotalNodeNumberMetrics checks that RecordClusterStatus exports
// NodeSummary.TotalNum through the cluster_node_number gauge.
func TestClusterTotalNodeNumberMetrics(t *testing.T) {
	const expected = `
# HELP cluster_node_number Number of nodes in the cluster.
# TYPE cluster_node_number gauge
cluster_node_number{cluster_name="foo"} 100
`

	cluster := &clusterv1alpha1.Cluster{}
	cluster.Name = "foo"
	cluster.Spec.SyncMode = clusterv1alpha1.Push
	cluster.Status.Conditions = []metav1.Condition{{
		Type:   clusterv1alpha1.ClusterConditionReady,
		Status: metav1.ConditionTrue,
	}}
	cluster.Status.NodeSummary = &clusterv1alpha1.NodeSummary{TotalNum: 100}

	// Start from an empty vector so samples from other tests cannot leak in.
	clusterTotalNodeNumberGauge.Reset()
	RecordClusterStatus(cluster)

	err := testutil.CollectAndCompare(clusterTotalNodeNumberGauge, strings.NewReader(expected), clusterTotalNodeNumberMetricsName)
	if err != nil {
		t.Errorf("unexpected collecting result:\n%s", err)
	}
}
// TestClusterReadyNodeNumberMetrics checks that RecordClusterStatus exports
// NodeSummary.ReadyNum through the cluster_ready_node_number gauge.
func TestClusterReadyNodeNumberMetrics(t *testing.T) {
	const expected = `
# HELP cluster_ready_node_number Number of ready nodes in the cluster.
# TYPE cluster_ready_node_number gauge
cluster_ready_node_number{cluster_name="foo"} 10
`

	cluster := &clusterv1alpha1.Cluster{}
	cluster.Name = "foo"
	cluster.Spec.SyncMode = clusterv1alpha1.Push
	cluster.Status.Conditions = []metav1.Condition{{
		Type:   clusterv1alpha1.ClusterConditionReady,
		Status: metav1.ConditionTrue,
	}}
	cluster.Status.NodeSummary = &clusterv1alpha1.NodeSummary{
		TotalNum: 100,
		ReadyNum: 10,
	}

	// Start from an empty vector so samples from other tests cannot leak in.
	clusterReadyNodeNumberGauge.Reset()
	RecordClusterStatus(cluster)

	err := testutil.CollectAndCompare(clusterReadyNodeNumberGauge, strings.NewReader(expected), clusterReadyNodeNumberMetricsName)
	if err != nil {
		t.Errorf("unexpected collecting result:\n%s", err)
	}
}
func TestClusterMemoryAllocatableMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocatable: corev1.ResourceList{
corev1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_memory_allocatable_bytes Allocatable cluster memory resource in bytes.
# TYPE cluster_memory_allocatable_bytes gauge
cluster_memory_allocatable_bytes{cluster_name="foo"} 200
`
clusterMemoryAllocatableGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterMemoryAllocatableGauge, strings.NewReader(want), clusterMemoryAllocatableMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
func TestClusterCPUAllocatableMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocatable: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_cpu_allocatable_number Number of allocatable CPU in the cluster.
# TYPE cluster_cpu_allocatable_number gauge
cluster_cpu_allocatable_number{cluster_name="foo"} 0.2
`
clusterCPUAllocatableGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterCPUAllocatableGauge, strings.NewReader(want), clusterCPUAllocatableMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
func TestClusterPodAllocatableMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocatable: corev1.ResourceList{
corev1.ResourcePods: *resource.NewQuantity(110, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_pod_allocatable_number Number of allocatable pods in the cluster.
# TYPE cluster_pod_allocatable_number gauge
cluster_pod_allocatable_number{cluster_name="foo"} 110
`
clusterPodAllocatableGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterPodAllocatableGauge, strings.NewReader(want), clusterPodAllocatableMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
func TestClusterMemoryAllocatedMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocated: corev1.ResourceList{
corev1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_memory_allocated_bytes Allocated cluster memory resource in bytes.
# TYPE cluster_memory_allocated_bytes gauge
cluster_memory_allocated_bytes{cluster_name="foo"} 200
`
clusterMemoryAllocatedGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterMemoryAllocatedGauge, strings.NewReader(want), clusterMemoryAllocatedMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
func TestClusterCPUAllocatedMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocated: corev1.ResourceList{
corev1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_cpu_allocated_number Number of allocated CPU in the cluster.
# TYPE cluster_cpu_allocated_number gauge
cluster_cpu_allocated_number{cluster_name="foo"} 0.2
`
clusterCPUAllocatedGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterCPUAllocatedGauge, strings.NewReader(want), clusterCPUAllocatedMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
func TestClusterPodAllocatedMetrics(t *testing.T) {
testCluster := &clusterv1alpha1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
},
Spec: clusterv1alpha1.ClusterSpec{
SyncMode: clusterv1alpha1.Push,
},
Status: clusterv1alpha1.ClusterStatus{
Conditions: []metav1.Condition{
{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
},
ResourceSummary: &clusterv1alpha1.ResourceSummary{
Allocated: corev1.ResourceList{
corev1.ResourcePods: *resource.NewQuantity(110, resource.DecimalSI),
},
},
},
}
want := `
# HELP cluster_pod_allocated_number Number of allocated pods in the cluster.
# TYPE cluster_pod_allocated_number gauge
cluster_pod_allocated_number{cluster_name="foo"} 110
`
clusterPodAllocatedGauge.Reset()
RecordClusterStatus(testCluster)
if err := testutil.CollectAndCompare(clusterPodAllocatedGauge, strings.NewReader(want), clusterPodAllocatedMetricsName); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}