diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index a0a2cfd91..6bc311f06 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -1,73 +1,99 @@ package metrics import ( - "sync" "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + + utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics" ) // SchedulerSubsystem - subsystem name used by scheduler const SchedulerSubsystem = "karmada_scheduler" +const ( + scheduledResult = "scheduled" + errorResult = "error" +) + +const ( + // BindingAdd is the event when a new binding is added to API server. + BindingAdd = "BindingAdd" + // BindingUpdate is the event when a new binding is updated to API server. + BindingUpdate = "BindingUpdate" + // ScheduleAttemptFailure is the event when a schedule attempt fails. + ScheduleAttemptFailure = "ScheduleAttemptFailure" + // PolicyChanged means binding needs to be rescheduled for the policy changed + PolicyChanged = "PolicyChanged" + // ClusterNotReady means binding needs to be rescheduled for cluster is not ready + ClusterNotReady = "ClusterNotReady" +) + +const ( + // ScheduleStepFilter means the step in generic scheduler to filter clusters + ScheduleStepFilter = "Filter" + // ScheduleStepScore means the step in generic scheduler to score clusters + ScheduleStepScore = "Score" + // ScheduleStepSelect means the step in generic scheduler to select clusters + ScheduleStepSelect = "Select" + // ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas + ScheduleStepAssignReplicas = "AssignReplicas" +) + var ( - scheduleAttempts = prometheus.NewCounterVec( + scheduleAttempts = promauto.NewCounterVec( prometheus.CounterOpts{ Subsystem: SchedulerSubsystem, Name: "schedule_attempts_total", Help: "Number of attempts to schedule resourceBinding", - }, []string{"result", "scheduleType"}) + }, []string{"result", "schedule_type"}) - e2eSchedulingLatency = prometheus.NewHistogramVec( + e2eSchedulingLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: SchedulerSubsystem, Name: "e2e_scheduling_duration_seconds", Help: "E2e scheduling latency in seconds", Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), - }, []string{"result", "scheduleType"}) + }, []string{"result", "schedule_type"}) - schedulingAlgorithmLatency = prometheus.NewHistogramVec( + schedulingAlgorithmLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: SchedulerSubsystem, Name: "scheduling_algorithm_duration_seconds", Help: "Scheduling algorithm latency in seconds(exclude scale scheduler)", Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), - }, []string{"scheduleStep"}) + }, []string{"schedule_step"}) - schedulerQueueIncomingBindings = prometheus.NewCounterVec( + schedulerQueueIncomingBindings = promauto.NewCounterVec( prometheus.CounterOpts{ Subsystem: SchedulerSubsystem, Name: "queue_incoming_bindings_total", Help: "Number of bindings added to scheduling queues by event type.", }, []string{"event"}) - - metricsList = []prometheus.Collector{ - scheduleAttempts, - e2eSchedulingLatency, - schedulingAlgorithmLatency, - schedulerQueueIncomingBindings, - } ) -var registerMetrics sync.Once - -// Register all metrics. -func Register() { - // Register the metrics. - registerMetrics.Do(func() { - RegisterMetrics(metricsList...) - }) -} - -// RegisterMetrics registers a list of metrics. -// This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics. -func RegisterMetrics(extraMetrics ...prometheus.Collector) { - for _, metric := range extraMetrics { - prometheus.MustRegister(metric) +// BindingSchedule can record a scheduling attempt and the duration +// since `start`. +func BindingSchedule(scheduleType string, duration float64, err error) { + if err != nil { + observeScheduleAttemptAndLatency(errorResult, scheduleType, duration) + } else { + observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration) } } -// SinceInSeconds gets the time since the specified start in seconds. -func SinceInSeconds(start time.Time) float64 { - return time.Since(start).Seconds() +func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) { + e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration) + scheduleAttempts.WithLabelValues(result, scheduleType).Inc() +} + +// ScheduleStep can record each scheduling step duration. +func ScheduleStep(action string, startTime time.Time) { + schedulingAlgorithmLatency.WithLabelValues(action).Observe(utilmetrics.DurationInSeconds(startTime)) +} + +// CountSchedulerBindings records the number of binding added to scheduling queues by event type. +func CountSchedulerBindings(event string) { + schedulerQueueIncomingBindings.WithLabelValues(event).Inc() } diff --git a/pkg/scheduler/metrics/metrics_record.go b/pkg/scheduler/metrics/metrics_record.go deleted file mode 100644 index 3f18fa0cf..000000000 --- a/pkg/scheduler/metrics/metrics_record.go +++ /dev/null @@ -1,57 +0,0 @@ -package metrics - -import "time" - -const ( - scheduledResult = "scheduled" - errorResult = "error" -) - -const ( - // BindingAdd is the event when a new binding is added to API server. - BindingAdd = "BindingAdd" - // BindingUpdate is the event when a new binding is updated to API server. - BindingUpdate = "BindingUpdate" - // ScheduleAttemptFailure is the event when a schedule attempt fails. - ScheduleAttemptFailure = "ScheduleAttemptFailure" - // PolicyChanged means binding needs to be rescheduled for the policy changed - PolicyChanged = "PolicyChanged" - // ClusterNotReady means binding needs to be rescheduled for cluster is not ready - ClusterNotReady = "ClusterNotReady" -) - -const ( - // ScheduleStepFilter means the step in generic scheduler to filter clusters - ScheduleStepFilter = "Filter" - // ScheduleStepScore means the step in generic scheduler to score clusters - ScheduleStepScore = "Score" - // ScheduleStepSelect means the step in generic scheduler to select clusters - ScheduleStepSelect = "Select" - // ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas - ScheduleStepAssignReplicas = "AssignReplicas" -) - -// BindingSchedule can record a scheduling attempt and the duration -// since `start`. -func BindingSchedule(scheduleType string, duration float64, err error) { - if err != nil { - observeScheduleAttemptAndLatency(errorResult, scheduleType, duration) - } else { - observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration) - } -} - -func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) { - e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration) - scheduleAttempts.WithLabelValues(result, scheduleType).Inc() -} - -// ScheduleStep can record each scheduling step duration. -func ScheduleStep(action string, startTime time.Time) { - schedulingAlgorithmLatency.WithLabelValues(action).Observe(SinceInSeconds(startTime)) -} - -// CountSchedulerBindings records the number of binding added to scheduling queues by event type. -func CountSchedulerBindings(event string) { - schedulerQueueIncomingBindings.WithLabelValues(event).Inc() -} diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index f070d36d3..2e3aa4de1 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -37,6 +37,7 @@ import ( "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration" "github.com/karmada-io/karmada/pkg/scheduler/metrics" "github.com/karmada-io/karmada/pkg/util" + utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics" ) // ScheduleType defines the schedule type of a binding object should be performed. @@ -125,8 +126,6 @@ func NewScheduler(dynamicClient dynamic.Interface, karmadaClient karmadaclientse estimatorclient.RegisterSchedulerEstimator(schedulerEstimator) } - metrics.Register() - sched.addAllEventHandlers() return sched } @@ -282,14 +281,14 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) { // policy placement changed, need schedule klog.Infof("Start to schedule ResourceBinding(%s/%s) as placement changed", namespace, name) err = s.scheduleResourceBinding(rb) - metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err) return err } if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&rb.Spec, policyPlacement.ReplicaScheduling) { // binding replicas changed, need reschedule klog.Infof("Reschedule ResourceBinding(%s/%s) as replicas scaled down or scaled up", namespace, name) err = s.scheduleResourceBinding(rb) - metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err) return err } // TODO(dddddai): reschedule bindings on cluster change @@ -301,7 +300,7 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) { if features.FeatureGate.Enabled(features.Failover) { klog.Infof("Reschedule ResourceBinding(%s/%s) as cluster failure or deletion", namespace, name) err = s.scheduleResourceBinding(rb) - metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err) return err } return nil @@ -345,14 +344,14 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) { // policy placement changed, need schedule klog.Infof("Start to schedule ClusterResourceBinding(%s) as placement changed", name) err = s.scheduleClusterResourceBinding(crb) - metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err) return err } if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&crb.Spec, policyPlacement.ReplicaScheduling) { // binding replicas changed, need reschedule klog.Infof("Reschedule ClusterResourceBinding(%s) as replicas scaled down or scaled up", name) err = s.scheduleClusterResourceBinding(crb) - metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err) return err } // TODO(dddddai): reschedule bindings on cluster change @@ -363,7 +362,7 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) { if features.FeatureGate.Enabled(features.Failover) { klog.Infof("Reschedule ClusterResourceBinding(%s) as cluster failure or deletion", name) err = s.scheduleClusterResourceBinding(crb) - metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err) + metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err) return err } return nil