Merge pull request #7481 from jackfrancis/vmss-proactive-deleting

azure: StrictCacheUpdates to disable proactive vmss cache updates
Kubernetes Prow Robot, 2024-11-11 18:52:46 +00:00 (committed by GitHub)
commit 93f74c0948
2 changed files with 32 additions and 15 deletions
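
For orientation before the diffs: the new switch is exposed in two ways, a strictCacheUpdates key in the cloud-provider config read by BuildAzureConfig (default false) and an AZURE_STRICT_CACHE_UPDATES environment variable that overrides it. Below is a minimal sketch of the config-file side, using a stripped-down stand-in for Config rather than the provider's real struct:

package main

import (
    "encoding/json"
    "fmt"
)

// Stripped-down stand-in for the provider's Config; only the new field is
// kept, with the same json tag the diff below adds.
type Config struct {
    StrictCacheUpdates bool `json:"strictCacheUpdates,omitempty"`
}

func main() {
    // Default introduced by the patch: strict cache updates are off.
    cfg := Config{StrictCacheUpdates: false}

    // A cloud-provider config file can flip it on; BuildAzureConfig applies
    // the file over the defaults, then the environment over both.
    raw := []byte(`{"strictCacheUpdates": true}`)
    if err := json.Unmarshal(raw, &cfg); err != nil {
        panic(err)
    }
    fmt.Println(cfg.StrictCacheUpdates) // true
}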


@@ -94,6 +94,9 @@ type Config struct {
     // (DEPRECATED, DO NOT USE) GetVmssSizeRefreshPeriod (seconds) defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance
     GetVmssSizeRefreshPeriod int `json:"getVmssSizeRefreshPeriod,omitempty" yaml:"getVmssSizeRefreshPeriod,omitempty"`
+
+    // StrictCacheUpdates updates cache values only after positive validation from Azure APIs
+    StrictCacheUpdates bool `json:"strictCacheUpdates,omitempty" yaml:"strictCacheUpdates,omitempty"`
 }

 // These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name.

@@ -122,6 +125,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
     cfg.CloudProviderBackoffJitter = providerazureconsts.BackoffJitterDefault
     cfg.VMType = providerazureconsts.VMTypeVMSS
     cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
+    cfg.StrictCacheUpdates = false

     // Config file overrides defaults
     if configReader != nil {

@@ -247,6 +251,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
     if _, err = assignBoolFromEnvIfExists(&cfg.EnableForceDelete, "AZURE_ENABLE_FORCE_DELETE"); err != nil {
         return nil, err
     }
+    if _, err = assignBoolFromEnvIfExists(&cfg.StrictCacheUpdates, "AZURE_STRICT_CACHE_UPDATES"); err != nil {
+        return nil, err
+    }
     if _, err = assignBoolFromEnvIfExists(&cfg.EnableDynamicInstanceList, "AZURE_ENABLE_DYNAMIC_INSTANCE_LIST"); err != nil {
         return nil, err
     }
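
The helper used above is not part of the hunk itself; the sketch below is a plausible shape for assignBoolFromEnvIfExists, inferred only from its call sites (it writes the parsed value into the target and reports whether the variable was set). Treat the details as an assumption, not the provider's actual implementation.

package config

import (
    "fmt"
    "os"
    "strconv"
)

// Hypothetical reconstruction of assignBoolFromEnvIfExists based on the call
// sites above; the real helper in the Azure provider may differ.
func assignBoolFromEnvIfExists(assignee *bool, name string) (bool, error) {
    value, found := os.LookupEnv(name)
    if !found || value == "" {
        // Variable absent: leave the existing value (default or config file) alone.
        return false, nil
    }
    parsed, err := strconv.ParseBool(value)
    if err != nil {
        return false, fmt.Errorf("failed to parse %s=%q as bool: %w", name, value, err)
    }
    *assignee = parsed
    return true, nil
}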


@@ -530,19 +530,22 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered
         return rerr.Error()
     }

-    // Proactively decrement scale set size so that we don't
-    // go below minimum node count if cache data is stale
-    // only do it for non-unregistered nodes
-    if !hasUnregisteredNodes {
-        scaleSet.sizeMutex.Lock()
-        scaleSet.curSize -= int64(len(instanceIDs))
-        scaleSet.lastSizeRefresh = time.Now()
-        scaleSet.sizeMutex.Unlock()
-    }
+    if !scaleSet.manager.config.StrictCacheUpdates {
+        // Proactively decrement scale set size so that we don't
+        // go below minimum node count if cache data is stale
+        // only do it for non-unregistered nodes
+        if !hasUnregisteredNodes {
+            scaleSet.sizeMutex.Lock()
+            scaleSet.curSize -= int64(len(instanceIDs))
+            scaleSet.lastSizeRefresh = time.Now()
+            scaleSet.sizeMutex.Unlock()
+        }

-    // Proactively set the status of the instances to be deleted in cache
-    for _, instance := range instancesToDelete {
-        scaleSet.setInstanceStatusByProviderID(instance.Name, cloudprovider.InstanceStatus{State: cloudprovider.InstanceDeleting})
+        // Proactively set the status of the instances to be deleted in cache
+        for _, instance := range instancesToDelete {
+            scaleSet.setInstanceStatusByProviderID(instance.Name, cloudprovider.InstanceStatus{State: cloudprovider.InstanceDeleting})
+        }
     }

     go scaleSet.waitForDeleteInstances(future, requiredIds)
@@ -558,11 +561,18 @@ func (scaleSet *ScaleSet) waitForDeleteInstances(future *azure.Future, requiredI
     isSuccess, err := isSuccessHTTPResponse(httpResponse, err)
     if isSuccess {
         klog.V(3).Infof(".WaitForDeleteInstancesResult(%v) for %s success", requiredIds.InstanceIds, scaleSet.Name)
-        // No need to invalidateInstanceCache because instanceStates were proactively set to "deleting"
+        if scaleSet.manager.config.StrictCacheUpdates {
+            if err := scaleSet.manager.forceRefresh(); err != nil {
+                klog.Errorf("forceRefresh failed with error: %v", err)
+            }
+            scaleSet.invalidateInstanceCache()
+        }
         return
     }
-    // On failure, invalidate the instanceCache - cannot have instances in deletingState
-    scaleSet.invalidateInstanceCache()
+    if !scaleSet.manager.config.StrictCacheUpdates {
+        // On failure, invalidate the instanceCache - cannot have instances in deletingState
+        scaleSet.invalidateInstanceCache()
+    }
     klog.Errorf("WaitForDeleteInstancesResult(%v) for %s failed with error: %v", requiredIds.InstanceIds, scaleSet.Name, err)
 }
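
Reading the two hunks together: with StrictCacheUpdates at its default of false, the provider keeps the old optimistic behavior, decrementing curSize and marking instances as deleting before Azure confirms anything, and invalidating the instance cache only if the delete fails; with it enabled, the cache is left untouched until the result is known, and a confirmed delete triggers forceRefresh plus invalidateInstanceCache. A condensed decision table as an illustrative sketch, not the provider's code:

package main

import "fmt"

// cacheActions lists which cache operations run once the asynchronous delete
// result is known, for a given StrictCacheUpdates setting and outcome.
// Purely a reading aid for the diff above.
func cacheActions(strict, deleteSucceeded bool) []string {
    switch {
    case deleteSucceeded && strict:
        // Strict mode: refresh only after Azure confirms the delete.
        return []string{"forceRefresh", "invalidateInstanceCache"}
    case deleteSucceeded && !strict:
        // Legacy mode: nothing left to do, statuses were already set to "deleting".
        return nil
    case !deleteSucceeded && !strict:
        // Legacy mode: drop the optimistic "deleting" statuses.
        return []string{"invalidateInstanceCache"}
    default:
        // Strict mode, failed delete: the cache was never touched optimistically.
        return nil
    }
}

func main() {
    fmt.Println(cacheActions(true, true))   // [forceRefresh invalidateInstanceCache]
    fmt.Println(cacheActions(false, false)) // [invalidateInstanceCache]
}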