karmada/pkg/scheduler/metrics/metrics.go

139 lines
5.1 KiB
Go

package metrics
import (
"time"
"github.com/prometheus/client_golang/prometheus"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
utilmetrics "github.com/karmada-io/karmada/pkg/util/metrics"
)
// SchedulerSubsystem - subsystem name used by scheduler
const SchedulerSubsystem = "karmada_scheduler"
const (
scheduledResult = "scheduled"
errorResult = "error"
)
const (
// BindingAdd is the event when a new binding is added to API server.
BindingAdd = "BindingAdd"
// BindingUpdate is the event when a new binding is updated to API server.
BindingUpdate = "BindingUpdate"
// ScheduleAttemptFailure is the event when a schedule attempt fails.
ScheduleAttemptFailure = "ScheduleAttemptFailure"
// PolicyChanged means binding needs to be rescheduled for the policy changed
PolicyChanged = "PolicyChanged"
// ClusterChanged means binding needs to be rescheduled for the cluster changed
ClusterChanged = "ClusterChanged"
)
const (
// ScheduleStepFilter means the step in generic scheduler to filter clusters
ScheduleStepFilter = "Filter"
// ScheduleStepScore means the step in generic scheduler to score clusters
ScheduleStepScore = "Score"
// ScheduleStepSelect means the step in generic scheduler to select clusters
ScheduleStepSelect = "Select"
// ScheduleStepAssignReplicas means the step in generic scheduler to assign replicas
ScheduleStepAssignReplicas = "AssignReplicas"
)
var (
scheduleAttempts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule resourceBinding",
}, []string{"result", "schedule_type"})
e2eSchedulingLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
}, []string{"result", "schedule_type"})
schedulingAlgorithmLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduling_algorithm_duration_seconds",
Help: "Scheduling algorithm latency in seconds(exclude scale scheduler)",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
}, []string{"schedule_step"})
// SchedulerQueueIncomingBindings is the number of bindings added to scheduling queues by event type.
SchedulerQueueIncomingBindings = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "queue_incoming_bindings_total",
Help: "Number of bindings added to scheduling queues by event type.",
}, []string{"event"})
// FrameworkExtensionPointDuration is the metrics which indicates the latency for running all plugins of a specific extension point.
FrameworkExtensionPointDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "framework_extension_point_duration_seconds",
Help: "Latency for running all plugins of a specific extension point.",
// Start with 0.1ms with the last bucket being [~200ms, Inf)
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 12),
},
[]string{"extension_point", "result"})
// PluginExecutionDuration is the metrics which indicates the duration for running a plugin at a specific extension point.
PluginExecutionDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "plugin_execution_duration_seconds",
Help: "Duration for running a plugin at a specific extension point.",
// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
// so that we have better granularity since plugin latency is very sensitive.
Buckets: prometheus.ExponentialBuckets(0.00001, 1.5, 20),
},
[]string{"plugin", "extension_point", "result"})
metrics = []prometheus.Collector{
scheduleAttempts,
e2eSchedulingLatency,
schedulingAlgorithmLatency,
SchedulerQueueIncomingBindings,
FrameworkExtensionPointDuration,
PluginExecutionDuration,
}
)
func init() {
for _, m := range metrics {
ctrlmetrics.Registry.MustRegister(m)
}
}
// BindingSchedule can record a scheduling attempt and the duration
// since `start`.
func BindingSchedule(scheduleType string, duration float64, err error) {
if err != nil {
observeScheduleAttemptAndLatency(errorResult, scheduleType, duration)
} else {
observeScheduleAttemptAndLatency(scheduledResult, scheduleType, duration)
}
}
func observeScheduleAttemptAndLatency(result, scheduleType string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, scheduleType).Observe(duration)
scheduleAttempts.WithLabelValues(result, scheduleType).Inc()
}
// ScheduleStep can record each scheduling step duration.
func ScheduleStep(action string, startTime time.Time) {
schedulingAlgorithmLatency.WithLabelValues(action).Observe(utilmetrics.DurationInSeconds(startTime))
}
// CountSchedulerBindings records the number of binding added to scheduling queues by event type.
func CountSchedulerBindings(event string) {
SchedulerQueueIncomingBindings.WithLabelValues(event).Inc()
}