Added CA metrics related to autoscaler execution
This commit is contained in:
parent
e17f3507d4
commit
4cdf06ea94
|
|
@ -58,7 +58,7 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
|
|||
// RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
|
||||
func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) {
|
||||
reconfigureStart := time.Now()
|
||||
metrics.UpdateLastTime("reconfigure")
|
||||
metrics.UpdateLastTime("reconfigure", reconfigureStart)
|
||||
if err := a.Reconfigure(); err != nil {
|
||||
glog.Errorf("Failed to reconfigure : %v", err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ func (a *PollingAutoscaler) ExitCleanUp() {
|
|||
// RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
|
||||
func (a *PollingAutoscaler) RunOnce(currentTime time.Time) {
|
||||
reconfigureStart := time.Now()
|
||||
metrics.UpdateLastTime("poll")
|
||||
metrics.UpdateLastTime("poll", reconfigureStart)
|
||||
if err := a.Poll(); err != nil {
|
||||
glog.Errorf("Failed to poll : %v", err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -80,6 +80,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
pdbLister := a.PodDisruptionBudgetLister()
|
||||
scaleDown := a.scaleDown
|
||||
autoscalingContext := a.AutoscalingContext
|
||||
runStart := time.Now()
|
||||
|
||||
readyNodes, err := readyNodeLister.List()
|
||||
if err != nil {
|
||||
|
|
@ -124,6 +125,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
return
|
||||
}
|
||||
|
||||
metrics.UpdateDuration("updateClusterState", runStart)
|
||||
metrics.UpdateLastTime("autoscaling", time.Now())
|
||||
|
||||
// Check if there are any nodes that failed to register in Kubernetes
|
||||
// master.
|
||||
unregisteredNodes := a.ClusterStateRegistry.GetUnregisteredNodes()
|
||||
|
|
@ -215,7 +219,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
glog.V(1).Info("Max total nodes in cluster reached")
|
||||
} else {
|
||||
scaleUpStart := time.Now()
|
||||
metrics.UpdateLastTime("scaleup")
|
||||
metrics.UpdateLastTime("scaleUp", scaleUpStart)
|
||||
scaledUp, err := ScaleUp(autoscalingContext, unschedulablePodsToHelp, readyNodes)
|
||||
|
||||
metrics.UpdateDuration("scaleup", scaleUpStart)
|
||||
|
|
@ -248,7 +252,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
|
||||
a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
|
||||
|
||||
metrics.UpdateLastTime("findUnneeded")
|
||||
glog.V(4).Infof("Calculating unneeded nodes")
|
||||
|
||||
scaleDown.CleanUp(time.Now())
|
||||
|
|
@ -270,9 +273,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
glog.V(4).Infof("Starting scale down")
|
||||
|
||||
scaleDownStart := time.Now()
|
||||
metrics.UpdateLastTime("scaledown")
|
||||
metrics.UpdateLastTime("scaleDown", scaleDownStart)
|
||||
result, err := scaleDown.TryToScaleDown(allNodes, allScheduled, pdbs)
|
||||
metrics.UpdateDuration("scaledown", scaleDownStart)
|
||||
metrics.UpdateDuration("scaleDown", scaleDownStart)
|
||||
|
||||
// TODO: revisit result handling
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ func run(_ <-chan struct{}) {
|
|||
case <-time.After(*scanInterval):
|
||||
{
|
||||
loopStart := time.Now()
|
||||
metrics.UpdateLastTime("main")
|
||||
metrics.UpdateLastTime("main", loopStart)
|
||||
|
||||
autoscaler.RunOnce(loopStart)
|
||||
|
||||
|
|
|
|||
|
|
@ -58,53 +58,44 @@ var (
|
|||
},
|
||||
)
|
||||
|
||||
lastTimestamp = prometheus.NewGaugeVec(
|
||||
/**** Metrics related to autoscaler execution ****/
|
||||
lastActivity = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "last_time_seconds",
|
||||
Help: "Last time CA run some main loop fragment.",
|
||||
}, []string{"main"},
|
||||
Name: "last_activity",
|
||||
Help: "Last time certain part of CA logic executed.",
|
||||
}, []string{"activity"},
|
||||
)
|
||||
|
||||
lastDuration = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "last_duration_microseconds",
|
||||
Help: "Time spent in last main loop fragments in microseconds.",
|
||||
}, []string{"main"},
|
||||
)
|
||||
|
||||
duration = prometheus.NewSummaryVec(
|
||||
functionDuration = prometheus.NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "duration_microseconds",
|
||||
Help: "Time spent in main loop fragments in microseconds.",
|
||||
}, []string{"main"},
|
||||
Name: "function_duration_seconds",
|
||||
Help: "Time taken by various parts of CA main loop.",
|
||||
}, []string{"function"},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(duration)
|
||||
prometheus.MustRegister(lastDuration)
|
||||
prometheus.MustRegister(lastTimestamp)
|
||||
prometheus.MustRegister(clusterSafeToAutoscale)
|
||||
prometheus.MustRegister(nodesCount)
|
||||
prometheus.MustRegister(unschedulablePodsCount)
|
||||
prometheus.MustRegister(lastActivity)
|
||||
prometheus.MustRegister(functionDuration)
|
||||
}
|
||||
|
||||
func durationToMicro(start time.Time) float64 {
|
||||
return float64(time.Now().Sub(start).Nanoseconds() / 1000)
|
||||
func getDuration(start time.Time) float64 {
|
||||
return time.Now().Sub(start).Seconds()
|
||||
}
|
||||
|
||||
// UpdateDuration records the duration of the step identified by the label
|
||||
func UpdateDuration(label string, start time.Time) {
|
||||
duration.WithLabelValues(label).Observe(durationToMicro(start))
|
||||
lastDuration.WithLabelValues(label).Set(durationToMicro(start))
|
||||
functionDuration.WithLabelValues(label).Observe(getDuration(start))
|
||||
}
|
||||
|
||||
// UpdateLastTime records the time the step identified by the label was started
|
||||
func UpdateLastTime(label string) {
|
||||
lastTimestamp.WithLabelValues(label).Set(float64(time.Now().Unix()))
|
||||
func UpdateLastTime(label string, now time.Time) {
|
||||
lastActivity.WithLabelValues(label).Set(float64(now.Unix()))
|
||||
}
|
||||
|
||||
// UpdateClusterState updates metrics related to cluster state
|
||||
|
|
|
|||
Loading…
Reference in New Issue