Merge pull request #1427 from Garrybest/pr_metrics

clean up scheduler metrics
2022-03-08 14:39:18 +08:00 · 2022-03-08 14:39:18 +08:00 · fece1b853c
parent beb8385c10 ab2028f341
commit fece1b853c
3 changed files with 66 additions and 98 deletions
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@ -1,73 +1,99 @@
 package metrics

 import (
-	"sync"
 	"time"

 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+
+	utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
 )

 // SchedulerSubsystem - subsystem name used by scheduler
 const SchedulerSubsystem = "karmada_scheduler"

+const (
+	scheduledResult = "scheduled"
+	errorResult     = "error"
+)
+
+const (
+	// BindingAdd is the event when a new binding is added to API server.
+	BindingAdd = "BindingAdd"
+	// BindingUpdate is the event when a new binding is updated to API server.
+	BindingUpdate = "BindingUpdate"
+	// ScheduleAttemptFailure is the event when a schedule attempt fails.
+	ScheduleAttemptFailure = "ScheduleAttemptFailure"
+	// PolicyChanged means binding needs to be rescheduled for the policy changed
+	PolicyChanged = "PolicyChanged"
+	// ClusterNotReady means binding needs to be rescheduled for cluster is not ready
+	ClusterNotReady = "ClusterNotReady"
+)
+
+const (
+	// ScheduleStepFilter means the step in generic scheduler to filter clusters
+	ScheduleStepFilter = "Filter"
+	// ScheduleStepScore means the step in generic scheduler to score clusters
+	ScheduleStepScore = "Score"
+	// ScheduleStepSelect means the step in generic scheduler to select clusters
+	ScheduleStepSelect = "Select"
+	// ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas
+	ScheduleStepAssignReplicas = "AssignReplicas"
+)
+
 var (
-	scheduleAttempts = prometheus.NewCounterVec(
+	scheduleAttempts = promauto.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "schedule_attempts_total",
 			Help:      "Number of attempts to schedule resourceBinding",
-		}, []string{"result", "scheduleType"})
+		}, []string{"result", "schedule_type"})

-	e2eSchedulingLatency = prometheus.NewHistogramVec(
+	e2eSchedulingLatency = promauto.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "e2e_scheduling_duration_seconds",
 			Help:      "E2e scheduling latency in seconds",
 			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 15),
-		}, []string{"result", "scheduleType"})
+		}, []string{"result", "schedule_type"})

-	schedulingAlgorithmLatency = prometheus.NewHistogramVec(
+	schedulingAlgorithmLatency = promauto.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "scheduling_algorithm_duration_seconds",
 			Help:      "Scheduling algorithm latency in seconds(exclude scale scheduler)",
 			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 15),
-		}, []string{"scheduleStep"})
+		}, []string{"schedule_step"})

-	schedulerQueueIncomingBindings = prometheus.NewCounterVec(
+	schedulerQueueIncomingBindings = promauto.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: SchedulerSubsystem,
 			Name:      "queue_incoming_bindings_total",
 			Help:      "Number of bindings added to scheduling queues by event type.",
 		}, []string{"event"})
-
-	metricsList = []prometheus.Collector{
-		scheduleAttempts,
-		e2eSchedulingLatency,
-		schedulingAlgorithmLatency,
-		schedulerQueueIncomingBindings,
-	}
 )

-var registerMetrics sync.Once
-
-// Register all metrics.
-func Register() {
-	// Register the metrics.
-	registerMetrics.Do(func() {
-		RegisterMetrics(metricsList...)
-	})
-}
-
-// RegisterMetrics registers a list of metrics.
-// This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics.
-func RegisterMetrics(extraMetrics ...prometheus.Collector) {
-	for _, metric := range extraMetrics {
-		prometheus.MustRegister(metric)
+// BindingSchedule can record a scheduling attempt and the duration
+// since `start`.
+func BindingSchedule(scheduleType string, duration float64, err error) {
+	if err != nil {
+		observeScheduleAttemptAndLatency(errorResult, scheduleType, duration)
+	} else {
+		observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration)
 	}
 }

-// SinceInSeconds gets the time since the specified start in seconds.
-func SinceInSeconds(start time.Time) float64 {
-	return time.Since(start).Seconds()
+func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) {
+	e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration)
+	scheduleAttempts.WithLabelValues(result, scheduleType).Inc()
+}
+
+// ScheduleStep can record each scheduling step duration.
+func ScheduleStep(action string, startTime time.Time) {
+	schedulingAlgorithmLatency.WithLabelValues(action).Observe(utilmetrics.DurationInSeconds(startTime))
+}
+
+// CountSchedulerBindings records the number of binding added to scheduling queues by event type.
+func CountSchedulerBindings(event string) {
+	schedulerQueueIncomingBindings.WithLabelValues(event).Inc()
 }
--- a/pkg/scheduler/metrics/metrics_record.go
+++ b/pkg/scheduler/metrics/metrics_record.go
@ -1,57 +0,0 @@
-package metrics
-
-import "time"
-
-const (
-	scheduledResult = "scheduled"
-	errorResult     = "error"
-)
-
-const (
-	// BindingAdd is the event when a new binding is added to API server.
-	BindingAdd = "BindingAdd"
-	// BindingUpdate is the event when a new binding is updated to API server.
-	BindingUpdate = "BindingUpdate"
-	// ScheduleAttemptFailure is the event when a schedule attempt fails.
-	ScheduleAttemptFailure = "ScheduleAttemptFailure"
-	// PolicyChanged means binding needs to be rescheduled for the policy changed
-	PolicyChanged = "PolicyChanged"
-	// ClusterNotReady means binding needs to be rescheduled for cluster is not ready
-	ClusterNotReady = "ClusterNotReady"
-)
-
-const (
-	// ScheduleStepFilter means the step in generic scheduler to filter clusters
-	ScheduleStepFilter = "Filter"
-	// ScheduleStepScore means the step in generic scheduler to score clusters
-	ScheduleStepScore = "Score"
-	// ScheduleStepSelect means the step in generic scheduler to select clusters
-	ScheduleStepSelect = "Select"
-	// ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas
-	ScheduleStepAssignReplicas = "AssignReplicas"
-)
-
-// BindingSchedule can record a scheduling attempt and the duration
-// since `start`.
-func BindingSchedule(scheduleType string, duration float64, err error) {
-	if err != nil {
-		observeScheduleAttemptAndLatency(errorResult, scheduleType, duration)
-	} else {
-		observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration)
-	}
-}
-
-func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) {
-	e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration)
-	scheduleAttempts.WithLabelValues(result, scheduleType).Inc()
-}
-
-// ScheduleStep can record each scheduling step duration.
-func ScheduleStep(action string, startTime time.Time) {
-	schedulingAlgorithmLatency.WithLabelValues(action).Observe(SinceInSeconds(startTime))
-}
-
-// CountSchedulerBindings records the number of binding added to scheduling queues by event type.
-func CountSchedulerBindings(event string) {
-	schedulerQueueIncomingBindings.WithLabelValues(event).Inc()
-}
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@ -37,6 +37,7 @@ import (
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration"
 	"github.com/karmada-io/karmada/pkg/scheduler/metrics"
 	"github.com/karmada-io/karmada/pkg/util"
+	utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
 )

 // ScheduleType defines the schedule type of a binding object should be performed.
@ -125,8 +126,6 @@ func NewScheduler(dynamicClient dynamic.Interface, karmadaClient karmadaclientse
 		estimatorclient.RegisterSchedulerEstimator(schedulerEstimator)
 	}

-	metrics.Register()
-
 	sched.addAllEventHandlers()
 	return sched
 }
@ -282,14 +281,14 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) {
 		// policy placement changed, need schedule
 		klog.Infof("Start to schedule ResourceBinding(%s/%s) as placement changed", namespace, name)
 		err = s.scheduleResourceBinding(rb)
-		metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&rb.Spec, policyPlacement.ReplicaScheduling) {
 		// binding replicas changed, need reschedule
 		klog.Infof("Reschedule ResourceBinding(%s/%s) as replicas scaled down or scaled up", namespace, name)
 		err = s.scheduleResourceBinding(rb)
-		metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	// TODO(dddddai): reschedule bindings on cluster change
@ -301,7 +300,7 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) {
 	if features.FeatureGate.Enabled(features.Failover) {
 		klog.Infof("Reschedule ResourceBinding(%s/%s) as cluster failure or deletion", namespace, name)
 		err = s.scheduleResourceBinding(rb)
-		metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	return nil
@ -345,14 +344,14 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
 		// policy placement changed, need schedule
 		klog.Infof("Start to schedule ClusterResourceBinding(%s) as placement changed", name)
 		err = s.scheduleClusterResourceBinding(crb)
-		metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&crb.Spec, policyPlacement.ReplicaScheduling) {
 		// binding replicas changed, need reschedule
 		klog.Infof("Reschedule ClusterResourceBinding(%s) as replicas scaled down or scaled up", name)
 		err = s.scheduleClusterResourceBinding(crb)
-		metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	// TODO(dddddai): reschedule bindings on cluster change
@ -363,7 +362,7 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
 	if features.FeatureGate.Enabled(features.Failover) {
 		klog.Infof("Reschedule ClusterResourceBinding(%s) as cluster failure or deletion", name)
 		err = s.scheduleClusterResourceBinding(crb)
-		metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err)
+		metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err)
 		return err
 	}
 	return nil