Added CA metrics related to autoscaler execution

This commit is contained in:
Maciej Pytel 2017-05-11 14:28:49 +02:00
parent e17f3507d4
commit 4cdf06ea94
5 changed files with 26 additions and 32 deletions

View File

@@ -58,7 +58,7 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
// RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) {
reconfigureStart := time.Now()
metrics.UpdateLastTime("reconfigure")
metrics.UpdateLastTime("reconfigure", reconfigureStart)
if err := a.Reconfigure(); err != nil {
glog.Errorf("Failed to reconfigure : %v", err)
}

View File

@@ -50,7 +50,7 @@ func (a *PollingAutoscaler) ExitCleanUp() {
// RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
func (a *PollingAutoscaler) RunOnce(currentTime time.Time) {
reconfigureStart := time.Now()
metrics.UpdateLastTime("poll")
metrics.UpdateLastTime("poll", reconfigureStart)
if err := a.Poll(); err != nil {
glog.Errorf("Failed to poll : %v", err)
}

View File

@@ -80,6 +80,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
pdbLister := a.PodDisruptionBudgetLister()
scaleDown := a.scaleDown
autoscalingContext := a.AutoscalingContext
runStart := time.Now()
readyNodes, err := readyNodeLister.List()
if err != nil {
@@ -124,6 +125,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
return
}
metrics.UpdateDuration("updateClusterState", runStart)
metrics.UpdateLastTime("autoscaling", time.Now())
// Check if there are any nodes that failed to register in kubernetes
// master.
unregisteredNodes := a.ClusterStateRegistry.GetUnregisteredNodes()
@@ -215,7 +219,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
glog.V(1).Info("Max total nodes in cluster reached")
} else {
scaleUpStart := time.Now()
metrics.UpdateLastTime("scaleup")
metrics.UpdateLastTime("scaleUp", scaleUpStart)
scaledUp, err := ScaleUp(autoscalingContext, unschedulablePodsToHelp, readyNodes)
metrics.UpdateDuration("scaleup", scaleUpStart)
@@ -248,7 +252,6 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
"lastScaleDownFailedTrail=%s schedulablePodsPresent=%v", calculateUnneededOnly,
a.lastScaleUpTime, a.lastScaleDownFailedTrial, schedulablePodsPresent)
metrics.UpdateLastTime("findUnneeded")
glog.V(4).Infof("Calculating unneeded nodes")
scaleDown.CleanUp(time.Now())
@@ -270,9 +273,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
glog.V(4).Infof("Starting scale down")
scaleDownStart := time.Now()
metrics.UpdateLastTime("scaledown")
metrics.UpdateLastTime("scaleDown", scaleDownStart)
result, err := scaleDown.TryToScaleDown(allNodes, allScheduled, pdbs)
metrics.UpdateDuration("scaledown", scaleDownStart)
metrics.UpdateDuration("scaleDown", scaleDownStart)
// TODO: revisit result handling
if err != nil {

View File

@@ -189,7 +189,7 @@ func run(_ <-chan struct{}) {
case <-time.After(*scanInterval):
{
loopStart := time.Now()
metrics.UpdateLastTime("main")
metrics.UpdateLastTime("main", loopStart)
autoscaler.RunOnce(loopStart)

View File

@@ -58,53 +58,44 @@ var (
},
)
lastTimestamp = prometheus.NewGaugeVec(
/**** Metrics related to autoscaler execution ****/
lastActivity = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "last_time_seconds",
Help: "Last time CA run some main loop fragment.",
}, []string{"main"},
Name: "last_activity",
Help: "Last time certain part of CA logic executed.",
}, []string{"activity"},
)
lastDuration = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: caNamespace,
Name: "last_duration_microseconds",
Help: "Time spent in last main loop fragments in microseconds.",
}, []string{"main"},
)
duration = prometheus.NewSummaryVec(
functionDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: caNamespace,
Name: "duration_microseconds",
Help: "Time spent in main loop fragments in microseconds.",
}, []string{"main"},
Name: "function_duration_seconds",
Help: "Time taken by various parts of CA main loop.",
}, []string{"function"},
)
)
func init() {
prometheus.MustRegister(duration)
prometheus.MustRegister(lastDuration)
prometheus.MustRegister(lastTimestamp)
prometheus.MustRegister(clusterSafeToAutoscale)
prometheus.MustRegister(nodesCount)
prometheus.MustRegister(unschedulablePodsCount)
prometheus.MustRegister(lastActivity)
prometheus.MustRegister(functionDuration)
}
func durationToMicro(start time.Time) float64 {
return float64(time.Now().Sub(start).Nanoseconds() / 1000)
func getDuration(start time.Time) float64 {
return time.Now().Sub(start).Seconds()
}
// UpdateDuration records the duration of the step identified by the label
func UpdateDuration(label string, start time.Time) {
duration.WithLabelValues(label).Observe(durationToMicro(start))
lastDuration.WithLabelValues(label).Set(durationToMicro(start))
functionDuration.WithLabelValues(label).Observe(getDuration(start))
}
// UpdateLastTime records the time the step identified by the label was started
func UpdateLastTime(label string) {
lastTimestamp.WithLabelValues(label).Set(float64(time.Now().Unix()))
func UpdateLastTime(label string, now time.Time) {
lastActivity.WithLabelValues(label).Set(float64(now.Unix()))
}
// UpdateClusterState updates metrics related to cluster state