Merge pull request #1427 from Garrybest/pr_metrics
clean up scheduler metrics
This commit is contained in:
commit
fece1b853c
|
@ -1,73 +1,99 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
|
||||||
|
utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
// SchedulerSubsystem - subsystem name used by scheduler
|
// SchedulerSubsystem - subsystem name used by scheduler
|
||||||
const SchedulerSubsystem = "karmada_scheduler"
|
const SchedulerSubsystem = "karmada_scheduler"
|
||||||
|
|
||||||
|
const (
|
||||||
|
scheduledResult = "scheduled"
|
||||||
|
errorResult = "error"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// BindingAdd is the event when a new binding is added to API server.
|
||||||
|
BindingAdd = "BindingAdd"
|
||||||
|
// BindingUpdate is the event when a new binding is updated to API server.
|
||||||
|
BindingUpdate = "BindingUpdate"
|
||||||
|
// ScheduleAttemptFailure is the event when a schedule attempt fails.
|
||||||
|
ScheduleAttemptFailure = "ScheduleAttemptFailure"
|
||||||
|
// PolicyChanged means binding needs to be rescheduled for the policy changed
|
||||||
|
PolicyChanged = "PolicyChanged"
|
||||||
|
// ClusterNotReady means binding needs to be rescheduled for cluster is not ready
|
||||||
|
ClusterNotReady = "ClusterNotReady"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// ScheduleStepFilter means the step in generic scheduler to filter clusters
|
||||||
|
ScheduleStepFilter = "Filter"
|
||||||
|
// ScheduleStepScore means the step in generic scheduler to score clusters
|
||||||
|
ScheduleStepScore = "Score"
|
||||||
|
// ScheduleStepSelect means the step in generic scheduler to select clusters
|
||||||
|
ScheduleStepSelect = "Select"
|
||||||
|
// ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas
|
||||||
|
ScheduleStepAssignReplicas = "AssignReplicas"
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
scheduleAttempts = prometheus.NewCounterVec(
|
scheduleAttempts = promauto.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "schedule_attempts_total",
|
Name: "schedule_attempts_total",
|
||||||
Help: "Number of attempts to schedule resourceBinding",
|
Help: "Number of attempts to schedule resourceBinding",
|
||||||
}, []string{"result", "scheduleType"})
|
}, []string{"result", "schedule_type"})
|
||||||
|
|
||||||
e2eSchedulingLatency = prometheus.NewHistogramVec(
|
e2eSchedulingLatency = promauto.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "e2e_scheduling_duration_seconds",
|
Name: "e2e_scheduling_duration_seconds",
|
||||||
Help: "E2e scheduling latency in seconds",
|
Help: "E2e scheduling latency in seconds",
|
||||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
|
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
|
||||||
}, []string{"result", "scheduleType"})
|
}, []string{"result", "schedule_type"})
|
||||||
|
|
||||||
schedulingAlgorithmLatency = prometheus.NewHistogramVec(
|
schedulingAlgorithmLatency = promauto.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "scheduling_algorithm_duration_seconds",
|
Name: "scheduling_algorithm_duration_seconds",
|
||||||
Help: "Scheduling algorithm latency in seconds(exclude scale scheduler)",
|
Help: "Scheduling algorithm latency in seconds(exclude scale scheduler)",
|
||||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
|
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
|
||||||
}, []string{"scheduleStep"})
|
}, []string{"schedule_step"})
|
||||||
|
|
||||||
schedulerQueueIncomingBindings = prometheus.NewCounterVec(
|
schedulerQueueIncomingBindings = promauto.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Subsystem: SchedulerSubsystem,
|
Subsystem: SchedulerSubsystem,
|
||||||
Name: "queue_incoming_bindings_total",
|
Name: "queue_incoming_bindings_total",
|
||||||
Help: "Number of bindings added to scheduling queues by event type.",
|
Help: "Number of bindings added to scheduling queues by event type.",
|
||||||
}, []string{"event"})
|
}, []string{"event"})
|
||||||
|
|
||||||
metricsList = []prometheus.Collector{
|
|
||||||
scheduleAttempts,
|
|
||||||
e2eSchedulingLatency,
|
|
||||||
schedulingAlgorithmLatency,
|
|
||||||
schedulerQueueIncomingBindings,
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var registerMetrics sync.Once
|
// BindingSchedule can record a scheduling attempt and the duration
|
||||||
|
// since `start`.
|
||||||
// Register all metrics.
|
func BindingSchedule(scheduleType string, duration float64, err error) {
|
||||||
func Register() {
|
if err != nil {
|
||||||
// Register the metrics.
|
observeScheduleAttemptAndLatency(errorResult, scheduleType, duration)
|
||||||
registerMetrics.Do(func() {
|
} else {
|
||||||
RegisterMetrics(metricsList...)
|
observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration)
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// RegisterMetrics registers a list of metrics.
|
|
||||||
// This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics.
|
|
||||||
func RegisterMetrics(extraMetrics ...prometheus.Collector) {
|
|
||||||
for _, metric := range extraMetrics {
|
|
||||||
prometheus.MustRegister(metric)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// SinceInSeconds gets the time since the specified start in seconds.
|
func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) {
|
||||||
func SinceInSeconds(start time.Time) float64 {
|
e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration)
|
||||||
return time.Since(start).Seconds()
|
scheduleAttempts.WithLabelValues(result, scheduleType).Inc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ScheduleStep can record each scheduling step duration.
|
||||||
|
func ScheduleStep(action string, startTime time.Time) {
|
||||||
|
schedulingAlgorithmLatency.WithLabelValues(action).Observe(utilmetrics.DurationInSeconds(startTime))
|
||||||
|
}
|
||||||
|
|
||||||
|
// CountSchedulerBindings records the number of binding added to scheduling queues by event type.
|
||||||
|
func CountSchedulerBindings(event string) {
|
||||||
|
schedulerQueueIncomingBindings.WithLabelValues(event).Inc()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
package metrics
|
|
||||||
|
|
||||||
import "time"
|
|
||||||
|
|
||||||
const (
|
|
||||||
scheduledResult = "scheduled"
|
|
||||||
errorResult = "error"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
// BindingAdd is the event when a new binding is added to API server.
|
|
||||||
BindingAdd = "BindingAdd"
|
|
||||||
// BindingUpdate is the event when a new binding is updated to API server.
|
|
||||||
BindingUpdate = "BindingUpdate"
|
|
||||||
// ScheduleAttemptFailure is the event when a schedule attempt fails.
|
|
||||||
ScheduleAttemptFailure = "ScheduleAttemptFailure"
|
|
||||||
// PolicyChanged means binding needs to be rescheduled for the policy changed
|
|
||||||
PolicyChanged = "PolicyChanged"
|
|
||||||
// ClusterNotReady means binding needs to be rescheduled for cluster is not ready
|
|
||||||
ClusterNotReady = "ClusterNotReady"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
// ScheduleStepFilter means the step in generic scheduler to filter clusters
|
|
||||||
ScheduleStepFilter = "Filter"
|
|
||||||
// ScheduleStepScore means the step in generic scheduler to score clusters
|
|
||||||
ScheduleStepScore = "Score"
|
|
||||||
// ScheduleStepSelect means the step in generic scheduler to select clusters
|
|
||||||
ScheduleStepSelect = "Select"
|
|
||||||
// ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas
|
|
||||||
ScheduleStepAssignReplicas = "AssignReplicas"
|
|
||||||
)
|
|
||||||
|
|
||||||
// BindingSchedule can record a scheduling attempt and the duration
|
|
||||||
// since `start`.
|
|
||||||
func BindingSchedule(scheduleType string, duration float64, err error) {
|
|
||||||
if err != nil {
|
|
||||||
observeScheduleAttemptAndLatency(errorResult, scheduleType, duration)
|
|
||||||
} else {
|
|
||||||
observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) {
|
|
||||||
e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration)
|
|
||||||
scheduleAttempts.WithLabelValues(result, scheduleType).Inc()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ScheduleStep can record each scheduling step duration.
|
|
||||||
func ScheduleStep(action string, startTime time.Time) {
|
|
||||||
schedulingAlgorithmLatency.WithLabelValues(action).Observe(SinceInSeconds(startTime))
|
|
||||||
}
|
|
||||||
|
|
||||||
// CountSchedulerBindings records the number of binding added to scheduling queues by event type.
|
|
||||||
func CountSchedulerBindings(event string) {
|
|
||||||
schedulerQueueIncomingBindings.WithLabelValues(event).Inc()
|
|
||||||
}
|
|
|
@ -37,6 +37,7 @@ import (
|
||||||
"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration"
|
"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration"
|
||||||
"github.com/karmada-io/karmada/pkg/scheduler/metrics"
|
"github.com/karmada-io/karmada/pkg/scheduler/metrics"
|
||||||
"github.com/karmada-io/karmada/pkg/util"
|
"github.com/karmada-io/karmada/pkg/util"
|
||||||
|
utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ScheduleType defines the schedule type of a binding object should be performed.
|
// ScheduleType defines the schedule type of a binding object should be performed.
|
||||||
|
@ -125,8 +126,6 @@ func NewScheduler(dynamicClient dynamic.Interface, karmadaClient karmadaclientse
|
||||||
estimatorclient.RegisterSchedulerEstimator(schedulerEstimator)
|
estimatorclient.RegisterSchedulerEstimator(schedulerEstimator)
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics.Register()
|
|
||||||
|
|
||||||
sched.addAllEventHandlers()
|
sched.addAllEventHandlers()
|
||||||
return sched
|
return sched
|
||||||
}
|
}
|
||||||
|
@ -282,14 +281,14 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) {
|
||||||
// policy placement changed, need schedule
|
// policy placement changed, need schedule
|
||||||
klog.Infof("Start to schedule ResourceBinding(%s/%s) as placement changed", namespace, name)
|
klog.Infof("Start to schedule ResourceBinding(%s/%s) as placement changed", namespace, name)
|
||||||
err = s.scheduleResourceBinding(rb)
|
err = s.scheduleResourceBinding(rb)
|
||||||
metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&rb.Spec, policyPlacement.ReplicaScheduling) {
|
if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&rb.Spec, policyPlacement.ReplicaScheduling) {
|
||||||
// binding replicas changed, need reschedule
|
// binding replicas changed, need reschedule
|
||||||
klog.Infof("Reschedule ResourceBinding(%s/%s) as replicas scaled down or scaled up", namespace, name)
|
klog.Infof("Reschedule ResourceBinding(%s/%s) as replicas scaled down or scaled up", namespace, name)
|
||||||
err = s.scheduleResourceBinding(rb)
|
err = s.scheduleResourceBinding(rb)
|
||||||
metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// TODO(dddddai): reschedule bindings on cluster change
|
// TODO(dddddai): reschedule bindings on cluster change
|
||||||
|
@ -301,7 +300,7 @@ func (s *Scheduler) doScheduleBinding(namespace, name string) (err error) {
|
||||||
if features.FeatureGate.Enabled(features.Failover) {
|
if features.FeatureGate.Enabled(features.Failover) {
|
||||||
klog.Infof("Reschedule ResourceBinding(%s/%s) as cluster failure or deletion", namespace, name)
|
klog.Infof("Reschedule ResourceBinding(%s/%s) as cluster failure or deletion", namespace, name)
|
||||||
err = s.scheduleResourceBinding(rb)
|
err = s.scheduleResourceBinding(rb)
|
||||||
metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
@ -345,14 +344,14 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
|
||||||
// policy placement changed, need schedule
|
// policy placement changed, need schedule
|
||||||
klog.Infof("Start to schedule ClusterResourceBinding(%s) as placement changed", name)
|
klog.Infof("Start to schedule ClusterResourceBinding(%s) as placement changed", name)
|
||||||
err = s.scheduleClusterResourceBinding(crb)
|
err = s.scheduleClusterResourceBinding(crb)
|
||||||
metrics.BindingSchedule(string(ReconcileSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(ReconcileSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&crb.Spec, policyPlacement.ReplicaScheduling) {
|
if policyPlacement.ReplicaScheduling != nil && util.IsBindingReplicasChanged(&crb.Spec, policyPlacement.ReplicaScheduling) {
|
||||||
// binding replicas changed, need reschedule
|
// binding replicas changed, need reschedule
|
||||||
klog.Infof("Reschedule ClusterResourceBinding(%s) as replicas scaled down or scaled up", name)
|
klog.Infof("Reschedule ClusterResourceBinding(%s) as replicas scaled down or scaled up", name)
|
||||||
err = s.scheduleClusterResourceBinding(crb)
|
err = s.scheduleClusterResourceBinding(crb)
|
||||||
metrics.BindingSchedule(string(ScaleSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(ScaleSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// TODO(dddddai): reschedule bindings on cluster change
|
// TODO(dddddai): reschedule bindings on cluster change
|
||||||
|
@ -363,7 +362,7 @@ func (s *Scheduler) doScheduleClusterBinding(name string) (err error) {
|
||||||
if features.FeatureGate.Enabled(features.Failover) {
|
if features.FeatureGate.Enabled(features.Failover) {
|
||||||
klog.Infof("Reschedule ClusterResourceBinding(%s) as cluster failure or deletion", name)
|
klog.Infof("Reschedule ClusterResourceBinding(%s) as cluster failure or deletion", name)
|
||||||
err = s.scheduleClusterResourceBinding(crb)
|
err = s.scheduleClusterResourceBinding(crb)
|
||||||
metrics.BindingSchedule(string(FailoverSchedule), metrics.SinceInSeconds(start), err)
|
metrics.BindingSchedule(string(FailoverSchedule), utilmetrics.DurationInSeconds(start), err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
Loading…
Reference in New Issue