Added CA metrics related to autoscaler execution

This commit is contained in:
Maciej Pytel 2017-05-11 14:28:49 +02:00
parent e17f3507d4
commit 4cdf06ea94
5 changed files with 26 additions and 32 deletions

View File

@@ -58,7 +58,7 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
// RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) {
reconfigureStart := time.Now()
metrics.UpdateLastTime("reconfigure")
metrics.UpdateLastTime("reconfigure", reconfigureStart)
if err := a.Reconfigure(); err != nil {
glog.Errorf("Failed to reconfigure : %v", err)
}

View File

@@ -50,7 +50,7 @@ func (a *PollingAutoscaler) ExitCleanUp() {
// RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
func (a *PollingAutoscaler) RunOnce(currentTime time.Time) {
reconfigureStart := time.Now()
metrics.UpdateLastTime("poll")
metrics.UpdateLastTime("poll", reconfigureStart)
if err := a.Poll(); err != nil {
glog.Errorf("Failed to poll : %v", err)
}

View File

@@ -80,6 +80,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
pdbLister := a.PodDisruptionBudgetLister()
scaleDown := a.scaleDown
autoscalingContext := a.AutoscalingContext
runStart := time.Now()
readyNodes, err := readyNodeLister.List()
if err != nil {
@@ -124,6 +125,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
return
}
metrics.UpdateDuration("updateClusterState", runStart)
metrics.UpdateLastTime("autoscaling", time.Now())
// Check if there are any nodes that failed to register in kubernetes
// master.
unregisteredNodes := a.ClusterStateRegistry.GetUnregisteredNodes()
@@ -215,7 +219,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
glog.V(1).Info("Max total nodes in cluster reached")
} else {
scaleUpStart := time.Now()
metrics.UpdateLastTime("scaleup")
metrics.UpdateLastTime("scaleUp", scaleUpStart)
scaledUp, err := ScaleUp(autoscalingContext, unschedulablePodsToHelp, readyNodes)
metrics.UpdateDuration("scaleup", scaleUpStart)
@@ -248,7 +252,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
metrics.UpdateLastTime("findUnneeded")
glog.V(4).Infof("Calculating unneeded nodes")
scaleDown.CleanUp(time.Now())
@@ -270,9 +273,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
glog.V(4).Infof("Starting scale down")
scaleDownStart := time.Now()
metrics.UpdateLastTime("scaledown")
metrics.UpdateLastTime("scaleDown", scaleDownStart)
result, err := scaleDown.TryToScaleDown(allNodes, allScheduled, pdbs)
metrics.UpdateDuration("scaledown", scaleDownStart)
metrics.UpdateDuration("scaleDown", scaleDownStart)
// TODO: revisit result handling
if err != nil {

View File

@@ -189,7 +189,7 @@ func run(_ <-chan struct{}) {
case <-time.After(*scanInterval):
{
loopStart := time.Now()
metrics.UpdateLastTime("main")
metrics.UpdateLastTime("main", loopStart)
autoscaler.RunOnce(loopStart)

View File

@@ -58,53 +58,44 @@ var (
},
)
lastTimestamp = prometheus.NewGaugeVec(
/**** Metrics related to autoscaler execution ****/
lastActivity = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "last_time_seconds",
Help: "Last time CA run some main loop fragment.",
}, []string{"main"},
Name: "last_activity",
Help: "Last time certain part of CA logic executed.",
}, []string{"activity"},
)
lastDuration = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "last_duration_microseconds",
Help: "Time spent in last main loop fragments in microseconds.",
}, []string{"main"},
)
duration = prometheus.NewSummaryVec(
functionDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: caNamespace,
Name: "duration_microseconds",
Help: "Time spent in main loop fragments in microseconds.",
}, []string{"main"},
Name: "function_duration_seconds",
Help: "Time taken by various parts of CA main loop.",
}, []string{"function"},
)
)
func init() {
prometheus.MustRegister(duration)
prometheus.MustRegister(lastDuration)
prometheus.MustRegister(lastTimestamp)
prometheus.MustRegister(clusterSafeToAutoscale)
prometheus.MustRegister(nodesCount)
prometheus.MustRegister(unschedulablePodsCount)
prometheus.MustRegister(lastActivity)
prometheus.MustRegister(functionDuration)
}
func durationToMicro(start time.Time) float64 {
return float64(time.Now().Sub(start).Nanoseconds() / 1000)
func getDuration(start time.Time) float64 {
return time.Now().Sub(start).Seconds()
}
// UpdateDuration records the duration of the step identified by the label
func UpdateDuration(label string, start time.Time) {
duration.WithLabelValues(label).Observe(durationToMicro(start))
lastDuration.WithLabelValues(label).Set(durationToMicro(start))
functionDuration.WithLabelValues(label).Observe(getDuration(start))
}
// UpdateLastTime records the time the step identified by the label was started
func UpdateLastTime(label string) {
lastTimestamp.WithLabelValues(label).Set(float64(time.Now().Unix()))
func UpdateLastTime(label string, now time.Time) {
lastActivity.WithLabelValues(label).Set(float64(now.Unix()))
}
// UpdateClusterState updates metrics related to cluster state