cluster-autoscaler/skip-node: unblock the cluster autoscaler when a node error occurs with a single node group

Signed-off-by: Julien Balestra <julien.balestra@datadoghq.com>
This commit is contained in:
Julien Balestra 2019-09-09 15:44:46 +02:00
parent 7dbdb9be6b
commit 3441f616e1
2 changed files with 30 additions and 19 deletions

View File

@ -58,13 +58,14 @@ type GceCache struct {
cacheMutex sync.Mutex
// Cache content.
migs map[GceRef]Mig
instanceRefToMigRef map[GceRef]GceRef
resourceLimiter *cloudprovider.ResourceLimiter
machinesCache map[MachineTypeKey]*gce.MachineType
migTargetSizeCache map[GceRef]int64
migBaseNameCache map[GceRef]string
instanceTemplatesCache map[GceRef]*gce.InstanceTemplate
migs map[GceRef]Mig
instanceRefToMigRef map[GceRef]GceRef
instancesFromUnknownMigs map[GceRef]struct{}
resourceLimiter *cloudprovider.ResourceLimiter
machinesCache map[MachineTypeKey]*gce.MachineType
migTargetSizeCache map[GceRef]int64
migBaseNameCache map[GceRef]string
instanceTemplatesCache map[GceRef]*gce.InstanceTemplate
// Service used to refresh cache.
GceService AutoscalingGceClient
@ -73,13 +74,14 @@ type GceCache struct {
// NewGceCache creates empty GceCache.
// Every map-typed cache named in the literal below is initialised to an empty,
// non-nil map, and GceService is set to the supplied gceService.
// resourceLimiter is not assigned here, so it keeps its zero value.
func NewGceCache(gceService AutoscalingGceClient) *GceCache {
return &GceCache{
migs: map[GceRef]Mig{},
instanceRefToMigRef: map[GceRef]GceRef{},
machinesCache: map[MachineTypeKey]*gce.MachineType{},
migTargetSizeCache: map[GceRef]int64{},
migBaseNameCache: map[GceRef]string{},
instanceTemplatesCache: map[GceRef]*gce.InstanceTemplate{},
GceService: gceService,
migs: map[GceRef]Mig{},
instanceRefToMigRef: map[GceRef]GceRef{},
// Set of instances recorded as belonging to no known MIG; starts empty.
instancesFromUnknownMigs: map[GceRef]struct{}{},
machinesCache: map[MachineTypeKey]*gce.MachineType{},
migTargetSizeCache: map[GceRef]int64{},
migBaseNameCache: map[GceRef]string{},
instanceTemplatesCache: map[GceRef]*gce.InstanceTemplate{},
GceService: gceService,
}
}
@ -114,7 +116,7 @@ func (gc *GceCache) UnregisterMig(toBeRemoved Mig) bool {
if found {
klog.V(1).Infof("Unregistered Mig %s", toBeRemoved.GceRef().String())
delete(gc.migs, toBeRemoved.GceRef())
gc.removeInstancesForMig(toBeRemoved.GceRef())
gc.removeInstancesForMigs(toBeRemoved.GceRef())
return true
}
return false
@ -157,6 +159,8 @@ func (gc *GceCache) GetMigForInstance(instanceRef GceRef) (Mig, error) {
return nil, fmt.Errorf("instance %+v belongs to unregistered mig %+v", instanceRef, migRef)
}
return mig, nil
} else if _, found := gc.instancesFromUnknownMigs[instanceRef]; found {
return nil, nil
}
for _, migRef := range gc.getMigRefs() {
@ -182,7 +186,9 @@ func (gc *GceCache) GetMigForInstance(instanceRef GceRef) (Mig, error) {
migRef, found := gc.instanceRefToMigRef[instanceRef]
if !found {
return nil, fmt.Errorf("instance %+v belongs to unknown mig", instanceRef)
klog.Warningf("instance %+v belongs to unknown mig", instanceRef)
gc.instancesFromUnknownMigs[instanceRef] = struct{}{}
return nil, nil
}
mig, found := gc.getMigNoLock(migRef)
if !found {
@ -195,10 +201,11 @@ func (gc *GceCache) GetMigForInstance(instanceRef GceRef) (Mig, error) {
return nil, nil
}
// removeInstancesForMigs walks instanceRefToMigRef and, for every instance whose
// recorded MIG equals migRef, deletes that instance from instanceRefToMigRef and
// from instancesFromUnknownMigs.
// The function performs no locking of its own.
// NOTE(review): presumably callers already hold cacheMutex — confirm.
// NOTE(review): only keys of instanceRefToMigRef are visited, so an entry of
// instancesFromUnknownMigs with no counterpart in instanceRefToMigRef is left
// untouched by this function.
func (gc *GceCache) removeInstancesForMig(migRef GceRef) {
func (gc *GceCache) removeInstancesForMigs(migRef GceRef) {
for instanceRef, instanceMigRef := range gc.instanceRefToMigRef {
if migRef == instanceMigRef {
// Deleting the current key while ranging over the same map is permitted in Go.
delete(gc.instanceRefToMigRef, instanceRef)
delete(gc.instancesFromUnknownMigs, instanceRef)
}
}
}
@ -219,7 +226,7 @@ func (gc *GceCache) regenerateInstanceCacheForMigNoLock(migRef GceRef) error {
klog.V(4).Infof("Regenerating MIG information for %s", migRef.String())
// cleanup old entries
gc.removeInstancesForMig(migRef)
gc.removeInstancesForMigs(migRef)
instances, err := gc.GceService.FetchMigInstances(migRef)
if err != nil {
@ -242,6 +249,7 @@ func (gc *GceCache) RegenerateInstancesCache() error {
defer gc.cacheMutex.Unlock()
gc.instanceRefToMigRef = make(map[GceRef]GceRef)
gc.instancesFromUnknownMigs = make(map[GceRef]struct{})
for _, migRef := range gc.getMigRefs() {
err := gc.regenerateInstanceCacheForMigNoLock(migRef)
if err != nil {

View File

@ -194,6 +194,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
stateUpdateStart := time.Now()
allNodes, readyNodes, typedErr := a.obtainNodeLists(a.CloudProvider)
if typedErr != nil {
klog.Errorf("Failed to get node list: %v", typedErr)
return typedErr
}
if a.actOnEmptyCluster(allNodes, readyNodes, currentTime) {
@ -202,7 +203,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil {
klog.Errorf("Failed to get daemonset list")
klog.Errorf("Failed to get daemonset list: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
@ -216,11 +217,13 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
nodeInfosForGroups, autoscalerError := getNodeInfosForGroups(
readyNodes, a.nodeInfoCache, autoscalingContext.CloudProvider, autoscalingContext.ListerRegistry, daemonsets, autoscalingContext.PredicateChecker, a.ignoredTaints)
if autoscalerError != nil {
klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)
return autoscalerError.AddPrefix("failed to build node infos for node groups: ")
}
typedErr = a.updateClusterState(allNodes, nodeInfosForGroups, currentTime)
if typedErr != nil {
klog.Errorf("Failed to update cluster state: %v", typedErr)
return typedErr
}
metrics.UpdateDurationFromStart(metrics.UpdateState, stateUpdateStart)