Merge pull request #237 from bskiba/split_scale_down
Drill down scale down metrics
commit 2ae609b93a

@@ -65,11 +65,11 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
 // RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
 func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError {
 	reconfigureStart := time.Now()
-	metrics.UpdateLastTime("reconfigure", reconfigureStart)
+	metrics.UpdateLastTime(metrics.Reconfigure, reconfigureStart)
 	if err := a.Reconfigure(); err != nil {
 		glog.Errorf("Failed to reconfigure : %v", err)
 	}
-	metrics.UpdateDuration("reconfigure", reconfigureStart)
+	metrics.UpdateDurationFromStart(metrics.Reconfigure, reconfigureStart)
 	return a.autoscaler.RunOnce(currentTime)
 }

@@ -64,11 +64,11 @@ func (a *PollingAutoscaler) ExitCleanUp() {
 // RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
 func (a *PollingAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError {
 	reconfigureStart := time.Now()
-	metrics.UpdateLastTime("poll", reconfigureStart)
+	metrics.UpdateLastTime(metrics.Poll, reconfigureStart)
 	if err := a.Poll(); err != nil {
 		glog.Errorf("Failed to poll : %v", err)
 	}
-	metrics.UpdateDuration("poll", reconfigureStart)
+	metrics.UpdateDurationFromStart(metrics.Poll, reconfigureStart)
 	return a.autoscaler.RunOnce(currentTime)
 }

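Both wrapper RunOnce methods follow the same instrumentation pattern: record the start of the step under a typed label, run the step, then report how long it took. Below is a minimal, self-contained sketch of that pattern; the helper functions and the Poll constant here are illustrative stand-ins, not the project's real metrics package.

package main

import (
	"fmt"
	"time"
)

// FunctionLabel mirrors the typed-label idea introduced in this PR.
type FunctionLabel string

const Poll FunctionLabel = "poll"

// updateLastTime is a stand-in: it would record when the step was started.
func updateLastTime(label FunctionLabel, now time.Time) {
	fmt.Printf("%s last started at %d\n", label, now.Unix())
}

// updateDurationFromStart is a stand-in: it derives the elapsed time from a start timestamp.
func updateDurationFromStart(label FunctionLabel, start time.Time) {
	fmt.Printf("%s took %v\n", label, time.Since(start))
}

func runOnce() error {
	start := time.Now()
	updateLastTime(Poll, start)          // mark the step as started
	time.Sleep(10 * time.Millisecond)    // placeholder for the real work
	updateDurationFromStart(Poll, start) // report the elapsed time
	return nil
}

func main() {
	_ = runOnce()
}
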
@@ -203,8 +203,10 @@ func (sd *ScaleDown) UpdateUnneededNodes(
 // TryToScaleDown tries to scale down the cluster. It returns ScaleDownResult indicating if any node was
 // removed and error if such occurred.
 func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs []*policyv1.PodDisruptionBudget) (ScaleDownResult, errors.AutoscalerError) {
 
 	now := time.Now()
+	nodeDeletionDuration := time.Duration(0)
+	findNodesToRemoveDuration := time.Duration(0)
+	defer updateScaleDownMetrics(now, &findNodesToRemoveDuration, &nodeDeletionDuration)
 	candidates := make([]*apiv1.Node, 0)
 	readinessMap := make(map[string]bool)
 

@@ -266,55 +268,23 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
 	// to recreate on other nodes.
 	emptyNodes := getEmptyNodes(candidates, pods, sd.context.MaxEmptyBulkDelete, sd.context.CloudProvider)
 	if len(emptyNodes) > 0 {
+		nodeDeletionStart := time.Now()
 		confirmation := make(chan errors.AutoscalerError, len(emptyNodes))
-		for _, node := range emptyNodes {
-			glog.V(0).Infof("Scale-down: removing empty node %s", node.Name)
-			sd.context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDownEmpty", "Scale-down: removing empty node %s", node.Name)
-			simulator.RemoveNodeFromTracker(sd.usageTracker, node.Name, sd.unneededNodes)
-			go func(nodeToDelete *apiv1.Node) {
-
-				deleteErr := deleteNodeFromCloudProvider(nodeToDelete, sd.context.CloudProvider,
-					sd.context.Recorder, sd.context.ClusterStateRegistry)
-				if deleteErr == nil {
-					if readinessMap[nodeToDelete.Name] {
-						metrics.RegisterScaleDown(1, metrics.Empty)
-					} else {
-						metrics.RegisterScaleDown(1, metrics.Unready)
-					}
-				}
-				confirmation <- deleteErr
-			}(node)
-		}
-		var finalError errors.AutoscalerError
-
-		startTime := time.Now()
-		for range emptyNodes {
-			timeElapsed := time.Now().Sub(startTime)
-			timeLeft := MaxCloudProviderNodeDeletionTime - timeElapsed
-			if timeLeft < 0 {
-				finalError = errors.NewAutoscalerError(errors.TransientError, "Failed to delete nodes in time")
-				break
-			}
-			select {
-			case err := <-confirmation:
-				if err != nil {
-					glog.Errorf("Problem with empty node deletion: %v", err)
-					finalError = err
-				}
-			case <-time.After(timeLeft):
-				finalError = errors.NewAutoscalerError(errors.TransientError, "Failed to delete nodes in time")
-			}
-		}
-		if finalError == nil {
+		sd.scheduleDeleteEmptyNodes(emptyNodes, readinessMap, confirmation)
+		err := sd.waitForEmptyNodesDeleted(emptyNodes, confirmation)
+		nodeDeletionDuration = time.Now().Sub(nodeDeletionStart)
+		if err == nil {
 			return ScaleDownNodeDeleted, nil
 		}
-		return ScaleDownError, finalError.AddPrefix("failed to delete at least one empty node: ")
+		return ScaleDownError, err.AddPrefix("failed to delete at least one empty node: ")
 	}
 
+	findNodesToRemoveStart := time.Now()
 	// We look for only 1 node so new hints may be incomplete.
 	nodesToRemove, _, err := simulator.FindNodesToRemove(candidates, nodes, pods, sd.context.ClientSet,
 		sd.context.PredicateChecker, 1, false,
 		sd.podLocationHints, sd.usageTracker, time.Now(), pdbs)
+	findNodesToRemoveDuration = time.Now().Sub(findNodesToRemoveStart)
 
 	if err != nil {
 		return ScaleDownError, err.AddPrefix("Find node to remove failed: ")

@@ -336,7 +306,9 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
 
 	// Nothing super-bad should happen if the node is removed from tracker prematurely.
 	simulator.RemoveNodeFromTracker(sd.usageTracker, toRemove.Node.Name, sd.unneededNodes)
+	nodeDeletionStart := time.Now()
 	err = deleteNode(sd.context, toRemove.Node, toRemove.PodsToReschedule)
+	nodeDeletionDuration = time.Now().Sub(nodeDeletionStart)
 	if err != nil {
 		return ScaleDownError, err.AddPrefix("Failed to delete %s: ", toRemove.Node.Name)
 	}

@@ -349,6 +321,16 @@ func (sd *ScaleDown) TryToScaleDown(nodes []*apiv1.Node, pods []*apiv1.Pod, pdbs
 	return ScaleDownNodeDeleted, nil
 }
 
+// updateScaleDownMetrics registers duration of different parts of scale down.
+// Separates time spent on finding nodes to remove, deleting nodes and other operations.
+func updateScaleDownMetrics(scaleDownStart time.Time, findNodesToRemoveDuration *time.Duration, nodeDeletionDuration *time.Duration) {
+	stop := time.Now()
+	miscDuration := stop.Sub(scaleDownStart) - *nodeDeletionDuration - *findNodesToRemoveDuration
+	metrics.UpdateDuration(metrics.ScaleDownNodeDeletion, *nodeDeletionDuration)
+	metrics.UpdateDuration(metrics.ScaleDownFindNodesToRemove, *findNodesToRemoveDuration)
+	metrics.UpdateDuration(metrics.ScaleDownMiscOperations, miscDuration)
+}
+
 // This functions finds empty nodes among passed candidates and returns a list of empty nodes
 // that can be deleted at the same time.
 func getEmptyNodes(candidates []*apiv1.Node, pods []*apiv1.Pod, maxEmptyBulkDelete int, cloudProvider cloudprovider.CloudProvider) []*apiv1.Node {

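updateScaleDownMetrics is installed with defer at the top of TryToScaleDown and receives pointers, so duration values assigned later in the function are still visible when the deferred call runs; whatever remains of the total after subtracting the two measured parts is reported as "misc". A small self-contained sketch of that defer-with-pointers pattern follows, with hypothetical names and printing in place of Prometheus.

package main

import (
	"fmt"
	"time"
)

// report mimics the shape of updateScaleDownMetrics: it reads the durations
// through pointers, so values set after the defer statement are still seen.
func report(start time.Time, findDuration, deleteDuration *time.Duration) {
	total := time.Since(start)
	misc := total - *findDuration - *deleteDuration
	fmt.Println("find:", *findDuration, "delete:", *deleteDuration, "misc:", misc)
}

func scaleDownOnce() {
	start := time.Now()
	findDuration := time.Duration(0)
	deleteDuration := time.Duration(0)
	// Arguments (the pointers) are evaluated here; the values are read at return.
	defer report(start, &findDuration, &deleteDuration)

	findStart := time.Now()
	time.Sleep(5 * time.Millisecond) // placeholder for finding nodes to remove
	findDuration = time.Since(findStart)

	deleteStart := time.Now()
	time.Sleep(3 * time.Millisecond) // placeholder for node deletion
	deleteDuration = time.Since(deleteStart)
}

func main() {
	scaleDownOnce()
}
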
@@ -391,6 +373,51 @@ func getEmptyNodes(candidates []*apiv1.Node, pods []*apiv1.Pod, maxEmptyBulkDele
 	return result[:limit]
 }
 
+func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node,
+	readinessMap map[string]bool, confirmation chan errors.AutoscalerError) {
+	for _, node := range emptyNodes {
+		glog.V(0).Infof("Scale-down: removing empty node %s", node.Name)
+		sd.context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDownEmpty", "Scale-down: removing empty node %s", node.Name)
+		simulator.RemoveNodeFromTracker(sd.usageTracker, node.Name, sd.unneededNodes)
+		go func(nodeToDelete *apiv1.Node) {
+
+			deleteErr := deleteNodeFromCloudProvider(nodeToDelete, sd.context.CloudProvider,
+				sd.context.Recorder, sd.context.ClusterStateRegistry)
+			if deleteErr == nil {
+				if readinessMap[nodeToDelete.Name] {
+					metrics.RegisterScaleDown(1, metrics.Empty)
+				} else {
+					metrics.RegisterScaleDown(1, metrics.Unready)
+				}
+			}
+			confirmation <- deleteErr
+		}(node)
+	}
+}
+
+func (sd *ScaleDown) waitForEmptyNodesDeleted(emptyNodes []*apiv1.Node, confirmation chan errors.AutoscalerError) errors.AutoscalerError {
+	var finalError errors.AutoscalerError
+
+	startTime := time.Now()
+	for range emptyNodes {
+		timeElapsed := time.Now().Sub(startTime)
+		timeLeft := MaxCloudProviderNodeDeletionTime - timeElapsed
+		if timeLeft < 0 {
+			return errors.NewAutoscalerError(errors.TransientError, "Failed to delete nodes in time")
+		}
+		select {
+		case err := <-confirmation:
+			if err != nil {
+				glog.Errorf("Problem with empty node deletion: %v", err)
+				finalError = err
+			}
+		case <-time.After(timeLeft):
+			finalError = errors.NewAutoscalerError(errors.TransientError, "Failed to delete nodes in time")
+		}
+	}
+	return finalError
+}
+
 func deleteNode(context *AutoscalingContext, node *apiv1.Node, pods []*apiv1.Pod) errors.AutoscalerError {
 	if err := drainNode(node, pods, context.ClientSet, context.Recorder, context.MaxGracefulTerminationSec,
 		MaxPodEvictionTime, EvictionRetryTime); err != nil {

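waitForEmptyNodesDeleted caps the total wait rather than the wait per node: each iteration recomputes the remaining budget and selects between one confirmation and a timer for that remainder. The following standalone sketch shows the same bounded-wait pattern with illustrative names and plain error values instead of errors.AutoscalerError.

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitAll collects one result per worker from confirmation, but never waits
// longer than maxWait in total; later iterations get only the remaining budget.
func waitAll(workers int, confirmation <-chan error, maxWait time.Duration) error {
	var finalErr error
	start := time.Now()
	for i := 0; i < workers; i++ {
		timeLeft := maxWait - time.Since(start)
		if timeLeft < 0 {
			return errors.New("timed out waiting for confirmations")
		}
		select {
		case err := <-confirmation:
			if err != nil {
				finalErr = err // remember the failure, keep draining
			}
		case <-time.After(timeLeft):
			finalErr = errors.New("timed out waiting for confirmations")
		}
	}
	return finalErr
}

func main() {
	confirmation := make(chan error, 3)
	for i := 0; i < 3; i++ {
		go func() {
			time.Sleep(2 * time.Millisecond) // placeholder for node deletion
			confirmation <- nil
		}()
	}
	fmt.Println("result:", waitAll(3, confirmation, time.Second))
}
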
@@ -135,8 +135,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
-	metrics.UpdateDuration("updateClusterState", runStart)
-	metrics.UpdateLastTime("autoscaling", time.Now())
+	metrics.UpdateDurationFromStart(metrics.UpdateState, runStart)
+	metrics.UpdateLastTime(metrics.Autoscaling, time.Now())
 
 	// Check if there are any nodes that failed to register in Kubernetes
 	// master.

@@ -227,11 +227,11 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 	}
 
 	scaleUpStart := time.Now()
-	metrics.UpdateLastTime("scaleUp", scaleUpStart)
+	metrics.UpdateLastTime(metrics.ScaleUp, scaleUpStart)
 
 	scaledUp, typedErr := ScaleUp(autoscalingContext, unschedulablePodsToHelp, readyNodes, daemonsets)
 
-	metrics.UpdateDuration("scaleUp", scaleUpStart)
+	metrics.UpdateDurationFromStart(metrics.ScaleUp, scaleUpStart)
 
 	if typedErr != nil {
 		glog.Errorf("Failed to scale up: %v", typedErr)

@@ -271,7 +271,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return typedErr
 	}
 
-	metrics.UpdateDuration("findUnneeded", unneededStart)
+	metrics.UpdateDurationFromStart(metrics.FindUnneeded, unneededStart)
 
 	for key, val := range scaleDown.unneededNodes {
 		if glog.V(4) {

@@ -283,9 +283,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		glog.V(4).Infof("Starting scale down")
 
 		scaleDownStart := time.Now()
-		metrics.UpdateLastTime("scaleDown", scaleDownStart)
+		metrics.UpdateLastTime(metrics.ScaleDown, scaleDownStart)
 		result, typedErr := scaleDown.TryToScaleDown(allNodes, allScheduled, pdbs)
-		metrics.UpdateDuration("scaleDown", scaleDownStart)
+		metrics.UpdateDurationFromStart(metrics.ScaleDown, scaleDownStart)
 
 		// TODO: revisit result handling
 		if typedErr != nil {

@@ -192,7 +192,7 @@ func run(healthCheck *metrics.HealthCheck) {
 		case <-time.After(*scanInterval):
 			{
 				loopStart := time.Now()
-				metrics.UpdateLastTime("main", loopStart)
+				metrics.UpdateLastTime(metrics.Main, loopStart)
 				healthCheck.UpdateLastActivity(loopStart)
 
 				err := autoscaler.RunOnce(loopStart)

@@ -202,7 +202,7 @@ func run(healthCheck *metrics.HealthCheck) {
 					healthCheck.UpdateLastSuccessfulRun(time.Now())
 				}
 
-				metrics.UpdateDuration("main", loopStart)
+				metrics.UpdateDurationFromStart(metrics.Main, loopStart)
 			}
 		}
 	}

@@ -30,6 +30,10 @@ import (
 // NodeScaleDownReason describes reason for removing node
 type NodeScaleDownReason string
 
+// FunctionLabel is a name of Cluster Autoscaler operation for which
+// we measure duration
+type FunctionLabel string
+
 const (
 	caNamespace = "cluster_autoscaler"
 	readyLabel = "ready"

@@ -50,6 +54,21 @@ const (
 	LogLongDurationThreshold = 5 * time.Second
 )
 
+// Names of Cluster Autoscaler operations
+const (
+	ScaleDown                  FunctionLabel = "scaleDown"
+	ScaleDownNodeDeletion      FunctionLabel = "scaleDown:nodeDeletion"
+	ScaleDownFindNodesToRemove FunctionLabel = "scaleDown:findNodesToRemove"
+	ScaleDownMiscOperations    FunctionLabel = "scaleDown:miscOperations"
+	ScaleUp                    FunctionLabel = "scaleUp"
+	FindUnneeded               FunctionLabel = "findUnneeded"
+	UpdateState                FunctionLabel = "updateClusterState"
+	Main                       FunctionLabel = "main"
+	Poll                       FunctionLabel = "poll"
+	Reconfigure                FunctionLabel = "reconfigure"
+	Autoscaling                FunctionLabel = "autoscaling"
+)
+
 var (
 	/**** Metrics related to cluster state ****/
 	clusterSafeToAutoscale = prometheus.NewGauge(

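Because FunctionLabel is a defined string type, the named constants above remain convenient to pass, while a plain string variable no longer satisfies the parameter without an explicit conversion; at the Prometheus boundary the label is converted back with string(label). A small sketch of how that plays out under Go's assignability rules (stand-in function, not the real metrics API):

package main

import "fmt"

// FunctionLabel mirrors the defined type from the diff.
type FunctionLabel string

const ScaleUp FunctionLabel = "scaleUp"

func updateLastTime(label FunctionLabel) {
	// Prometheus label values are plain strings, so the typed label is
	// converted back at the boundary, as the diff does with string(label).
	fmt.Println("label value:", string(label))
}

func main() {
	updateLastTime(ScaleUp)   // named constant: compiles
	updateLastTime("scaleUp") // untyped literal: still allowed by Go's assignability rules
	// s := "scaleUp"
	// updateLastTime(s)      // a plain string variable would not compile without FunctionLabel(s)
}
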
@@ -149,20 +168,26 @@ func init() {
 	prometheus.MustRegister(unneededNodesCount)
 }
 
-// UpdateDuration records the duration of the step identified by the label
-func UpdateDuration(label string, start time.Time) {
+// UpdateDurationFromStart records the duration of the step identified by the
+// label using start time
+func UpdateDurationFromStart(label FunctionLabel, start time.Time) {
 	duration := time.Now().Sub(start)
+	UpdateDuration(label, duration)
+}
+
+// UpdateDuration records the duration of the step identified by the label
+func UpdateDuration(label FunctionLabel, duration time.Duration) {
 	// TODO(maciekpytel): remove second condition if we manage to get
 	// asynchronous node drain
-	if duration > LogLongDurationThreshold && label != "scaleDown" {
+	if duration > LogLongDurationThreshold && label != ScaleDown {
 		glog.Infof("Function %s took %v to complete", label, duration)
 	}
-	functionDuration.WithLabelValues(label).Observe(duration.Seconds())
+	functionDuration.WithLabelValues(string(label)).Observe(duration.Seconds())
 }
 
 // UpdateLastTime records the time the step identified by the label was started
-func UpdateLastTime(label string, now time.Time) {
-	lastActivity.WithLabelValues(label).Set(float64(now.Unix()))
+func UpdateLastTime(label FunctionLabel, now time.Time) {
+	lastActivity.WithLabelValues(string(label)).Set(float64(now.Unix()))
 }
 
 // UpdateClusterState updates metrics related to cluster state

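The functionDuration and lastActivity collectors that these functions write to are defined elsewhere in metrics.go and are not shown in this diff. As a rough sketch only, assuming the Prometheus Go client, a pair of vectors keyed by a single "function" label would support exactly the calls above; the actual definitions in cluster-autoscaler (metric types, names, help strings, buckets) may well differ.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical collectors; the real names, types and help strings live in the
// parts of metrics.go that this diff does not show.
var (
	functionDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "cluster_autoscaler",
			Name:      "function_duration_seconds",
			Help:      "Time spent in a Cluster Autoscaler function.",
		}, []string{"function"},
	)
	lastActivity = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "cluster_autoscaler",
			Name:      "last_activity",
			Help:      "Unix timestamp of the last time a function was started.",
		}, []string{"function"},
	)
)

func main() {
	prometheus.MustRegister(functionDuration, lastActivity)
	start := time.Now()
	lastActivity.WithLabelValues("scaleDown").Set(float64(start.Unix()))
	functionDuration.WithLabelValues("scaleDown").Observe(time.Since(start).Seconds())
}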