Correctly classify errors for failed scale-ups

This commit is contained in:
Marwan Ahmed 2020-09-13 21:14:27 -07:00
parent 63259fb5dd
commit a3bada3708
3 changed files with 4 additions and 2 deletions

View File

@ -287,7 +287,7 @@ func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroup cloudprovider.NodeGr
// RegisterFailedScaleUp records a failed scale-up of nodeGroup for the given
// reason at currentTime. It takes the registry lock for the duration of the
// call and delegates to registerFailedScaleUpNoLock, always passing
// cloudprovider.OtherErrorClass as the error class.
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, currentTime time.Time) {
csr.Lock()
defer csr.Unlock()
// NOTE(review): the two calls below differ only in the error-code argument
// (the literal "cloudProviderError" vs. string(reason)). They look like the
// removed and added lines of a one-line diff hunk whose +/- markers were
// lost; presumably only the string(reason) call exists in the committed
// file, so the failure is registered once — confirm against the real source.
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, "cloudProviderError", currentTime)
// Uses the reason's own string value as the error code instead of a fixed literal.
csr.registerFailedScaleUpNoLock(nodeGroup, reason, cloudprovider.OtherErrorClass, string(reason), currentTime)
}
func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, currentTime time.Time) {

View File

@ -666,7 +666,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
increase := info.NewSize - info.CurrentSize
if err := info.Group.IncreaseSize(increase); err != nil {
context.LogRecorder.Eventf(apiv1.EventTypeWarning, "FailedToScaleUpGroup", "Scale-up failed for group %s: %v", info.Group.Id(), err)
clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.APIError, now)
clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.CloudProviderError, now)
return errors.NewAutoscalerError(errors.CloudProviderError,
"failed to increase node group size: %v", err)
}

View File

@ -56,6 +56,8 @@ const (
// Unready node was removed
Unready NodeScaleDownReason = "unready"
// CloudProviderError caused scale-up to fail
CloudProviderError FailedScaleUpReason = "cloudProviderError"
// APIError caused scale-up to fail
APIError FailedScaleUpReason = "apiCallError"
// Timeout was encountered when trying to scale-up