Merge pull request #3972 from zhy76/metric

feat: add performance metrics for FederatedHPA

commit 758a6ceaf0
@@ -35,6 +35,7 @@ import (
 	policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
 	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 	"github.com/karmada-io/karmada/pkg/controllers/federatedhpa/monitor"
+	"github.com/karmada-io/karmada/pkg/metrics"
 	"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
 	"github.com/karmada-io/karmada/pkg/util"
 	"github.com/karmada-io/karmada/pkg/util/fedinformer/typedmanager"
@@ -155,7 +156,12 @@ func (c *FederatedHPAController) Reconcile(ctx context.Context, req controllerru
 	}
 	c.hpaSelectorsMux.Unlock()

-	err := c.reconcileAutoscaler(ctx, hpa)
+	// observe process FederatedHPA latency
+	var err error
+	startTime := time.Now()
+	defer metrics.ObserveProcessFederatedHPALatency(err, startTime)
+
+	err = c.reconcileAutoscaler(ctx, hpa)
 	if err != nil {
 		return controllerruntime.Result{}, err
 	}
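One caveat with the instrumentation above: the arguments of a deferred call are evaluated when the `defer` statement executes, so `ObserveProcessFederatedHPALatency` captures `err` while it is still nil and will always record a "success" result. Recording the error the function actually returns requires deferring a closure instead. A minimal standalone sketch of the difference (plain Go semantics, not code from this PR):

package main

import (
	"errors"
	"fmt"
	"time"
)

func observe(err error, start time.Time) {
	fmt.Printf("result=%v elapsed=%v\n", err, time.Since(start))
}

func reconcileByValue() (err error) {
	start := time.Now()
	// err is evaluated here, while still nil: this always
	// reports result=<nil>, regardless of the return value.
	defer observe(err, start)
	err = errors.New("reconcile failed")
	return err
}

func reconcileByClosure() (err error) {
	start := time.Now()
	// the closure reads err when the function returns,
	// so the final error value is what gets observed.
	defer func() { observe(err, start) }()
	err = errors.New("reconcile failed")
	return err
}

func main() {
	_ = reconcileByValue()   // prints result=<nil>
	_ = reconcileByClosure() // prints result=reconcile failed
}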
@@ -32,6 +32,8 @@ import (
 	resourceclient "k8s.io/metrics/pkg/client/clientset/versioned/typed/metrics/v1beta1"
 	customclient "k8s.io/metrics/pkg/client/custom_metrics"
 	externalclient "k8s.io/metrics/pkg/client/external_metrics"
+
+	"github.com/karmada-io/karmada/pkg/metrics"
 )

 const (
@@ -64,6 +66,11 @@ type resourceMetricsClient struct {
 // GetResourceMetric gets the given resource metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *resourceMetricsClient) GetResourceMetric(ctx context.Context, resource corev1.ResourceName, namespace string, selector labels.Selector, container string) (PodMetricsInfo, time.Time, error) {
+	// observe pull ResourceMetric latency
+	var err error
+	startTime := time.Now()
+	defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ResourceMetric", startTime)
+
 	metrics, err := c.client.PodMetricses(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
 	if err != nil {
 		return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from resource metrics API: %v", err)
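A subtlety in this and the following hunks: the local variable `metrics` introduced by `metrics, err := c.client.PodMetricses(...)` shadows the newly imported `metrics` package for the rest of the function. The code still compiles because the deferred observation references the package before the shadowing declaration, and `:=` merely assigns to the `err` already declared in the same scope. The same defer caveat as in the controller applies here: `err` is captured while still nil.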
@@ -143,6 +150,11 @@ type customMetricsClient struct {
 // GetRawMetric gets the given metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *customMetricsClient) GetRawMetric(metricName string, namespace string, selector labels.Selector, metricSelector labels.Selector) (PodMetricsInfo, time.Time, error) {
+	// observe pull RawMetric latency
+	var err error
+	startTime := time.Now()
+	defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "RawMetric", startTime)
+
 	metrics, err := c.client.NamespacedMetrics(namespace).GetForObjects(schema.GroupKind{Kind: "Pod"}, selector, metricName, metricSelector)
 	if err != nil {
 		return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from custom metrics API: %v", err)
@@ -175,9 +187,13 @@ func (c *customMetricsClient) GetRawMetric(metricName string, namespace string,
 // GetObjectMetric gets the given metric (and an associated timestamp) for the given
 // object in the given namespace
 func (c *customMetricsClient) GetObjectMetric(metricName string, namespace string, objectRef *autoscalingv2.CrossVersionObjectReference, metricSelector labels.Selector) (int64, time.Time, error) {
+	// observe pull ObjectMetric latency
+	var err error
+	startTime := time.Now()
+	defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ObjectMetric", startTime)
+
 	gvk := schema.FromAPIVersionAndKind(objectRef.APIVersion, objectRef.Kind)
 	var metricValue *customapi.MetricValue
-	var err error
 	if gvk.Kind == "Namespace" && gvk.Group == "" {
 		// handle namespace separately
 		// NB: we ignore namespace name here, since CrossVersionObjectReference isn't
@@ -203,6 +219,11 @@ type externalMetricsClient struct {
 // GetExternalMetric gets all the values of a given external metric
 // that match the specified selector.
 func (c *externalMetricsClient) GetExternalMetric(metricName, namespace string, selector labels.Selector) ([]int64, time.Time, error) {
+	// observe pull ExternalMetric latency
+	var err error
+	startTime := time.Now()
+	defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ExternalMetric", startTime)
+
 	metrics, err := c.client.NamespacedMetrics(namespace).List(metricName, selector)
 	if err != nil {
 		return []int64{}, time.Time{}, fmt.Errorf("unable to fetch metrics from external metrics API: %v", err)
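The four client methods above repeat the same four-line preamble, varying only the metricType label ("ResourceMetric", "RawMetric", "ObjectMetric", "ExternalMetric"). One way to state the pattern once, and to fix the nil-capture caveat in passing, is a small helper; `trackPullLatency` below is a hypothetical sketch, not part of this PR:

package metricsclient

import (
	"time"

	"github.com/karmada-io/karmada/pkg/metrics"
)

// trackPullLatency (hypothetical) starts a timer and returns a function to be
// deferred with the address of the named error result, so the observation
// sees the error value at return time rather than at defer time.
func trackPullLatency(metricType string) func(*error) {
	startTime := time.Now()
	return func(err *error) {
		metrics.ObserveFederatedHPAPullMetricsLatency(*err, metricType, startTime)
	}
}

// Usage inside a client method would look like:
//
//	func (c *externalMetricsClient) GetExternalMetric(...) (values []int64, ts time.Time, err error) {
//		done := trackPullLatency("ExternalMetric")
//		defer func() { done(&err) }()
//		...
//	}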
@@ -9,14 +9,16 @@ import (
 )

 const (
-	resourceMatchPolicyDurationMetricsName  = "resource_match_policy_duration_seconds"
-	resourceApplyPolicyDurationMetricsName  = "resource_apply_policy_duration_seconds"
-	policyApplyAttemptsMetricsName          = "policy_apply_attempts_total"
-	syncWorkDurationMetricsName             = "binding_sync_work_duration_seconds"
-	syncWorkloadDurationMetricsName         = "work_sync_workload_duration_seconds"
-	policyPreemptionMetricsName             = "policy_preemption_total"
-	cronFederatedHPADurationMetricsName     = "cronfederatedhpa_process_duration_seconds"
-	cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
+	resourceMatchPolicyDurationMetricsName     = "resource_match_policy_duration_seconds"
+	resourceApplyPolicyDurationMetricsName     = "resource_apply_policy_duration_seconds"
+	policyApplyAttemptsMetricsName             = "policy_apply_attempts_total"
+	syncWorkDurationMetricsName                = "binding_sync_work_duration_seconds"
+	syncWorkloadDurationMetricsName            = "work_sync_workload_duration_seconds"
+	policyPreemptionMetricsName                = "policy_preemption_total"
+	cronFederatedHPADurationMetricsName        = "cronfederatedhpa_process_duration_seconds"
+	cronFederatedHPARuleDurationMetricsName    = "cronfederatedhpa_rule_process_duration_seconds"
+	federatedHPADurationMetricsName            = "federatedhpa_process_duration_seconds"
+	federatedHPAPullMetricsDurationMetricsName = "federatedhpa_pull_metrics_duration_seconds"
 )

 var (
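The eight pre-existing constants are unchanged; they appear in this hunk only because gofmt re-aligns the `=` column of the whole block to fit the new, longer `federatedHPAPullMetricsDurationMetricsName`.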
@@ -65,6 +67,18 @@ var (
 		Help:    "Duration in seconds to process a CronFederatedHPA rule. By the result, 'error' means a CronFederatedHPA rule failed to be processed. Otherwise 'success'.",
 		Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
 	}, []string{"result"})
+
+	federatedHPADurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    federatedHPADurationMetricsName,
+		Help:    "Duration in seconds to process a FederatedHPA. By the result, 'error' means a FederatedHPA failed to be processed. Otherwise 'success'.",
+		Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+	}, []string{"result"})
+
+	federatedHPAPullMetricsDurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    federatedHPAPullMetricsDurationMetricsName,
+		Help:    "Duration in seconds taken by the FederatedHPA to pull metrics. By the result, 'error' means the FederatedHPA failed to pull the metrics. Otherwise 'success'.",
+		Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+	}, []string{"result", "metricType"})
 )

 // ObserveFindMatchedPolicyLatency records the duration for the resource finding a matched policy.
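For reference on the bucket layout: `prometheus.ExponentialBuckets(start, factor, count)` returns `count` upper bounds starting at `start`, each `factor` times the previous, so both new histograms cover 0.01s up to 0.01 * 2^11 = 20.48s. A quick sketch to confirm:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// 12 upper bounds: 0.01, 0.02, 0.04, ..., 20.48 (seconds)
	fmt.Println(prometheus.ExponentialBuckets(0.01, 2, 12))
}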
@@ -103,6 +117,16 @@ func ObserveProcessCronFederatedHPARuleLatency(err error, start time.Time) {
 	cronFederatedHPARuleDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }

+// ObserveProcessFederatedHPALatency records the duration to process a FederatedHPA.
+func ObserveProcessFederatedHPALatency(err error, start time.Time) {
+	federatedHPADurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
+}
+
+// ObserveFederatedHPAPullMetricsLatency records the duration it takes for the FederatedHPA to pull metrics.
+func ObserveFederatedHPAPullMetricsLatency(err error, metricType string, start time.Time) {
+	federatedHPAPullMetricsDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err), metricType).Observe(utilmetrics.DurationInSeconds(start))
+}
+
 // ResourceCollectors returns the collectors about resources.
 func ResourceCollectors() []prometheus.Collector {
 	return []prometheus.Collector{
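The new helpers can be exercised with client_golang's testutil package. A sketch assuming, per the Help strings above, that `utilmetrics.GetResultByError` maps a nil error to "success" and a non-nil error to "error":

package metrics_test

import (
	"errors"
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus/testutil"

	"github.com/karmada-io/karmada/pkg/metrics"
)

func TestObserveProcessFederatedHPALatency(t *testing.T) {
	start := time.Now()
	metrics.ObserveProcessFederatedHPALatency(nil, start)                // result="success"
	metrics.ObserveProcessFederatedHPALatency(errors.New("boom"), start) // result="error"

	// Expect one series per observed result label value.
	n := 0
	for _, c := range metrics.ResourceCollectors() {
		n += testutil.CollectAndCount(c, "federatedhpa_process_duration_seconds")
	}
	if n != 2 {
		t.Fatalf("expected 2 series, got %d", n)
	}
}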
@@ -114,6 +138,8 @@ func ResourceCollectors() []prometheus.Collector {
 		policyPreemptionCounter,
 		cronFederatedHPADurationHistogram,
 		cronFederatedHPARuleDurationHistogram,
+		federatedHPADurationHistogram,
+		federatedHPAPullMetricsDurationHistogram,
 	}
 }

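`ResourceCollectors` only returns the collectors; the serving component still has to register them with a registry and expose them over HTTP. A generic wiring sketch (karmada's binaries do their own wiring; this is not code from the PR):

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/karmada-io/karmada/pkg/metrics"
)

func main() {
	registry := prometheus.NewRegistry()
	// Register every collector returned by ResourceCollectors, including
	// the two FederatedHPA histograms added in this PR.
	registry.MustRegister(metrics.ResourceCollectors()...)

	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}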