From 1ae92f7bc46a3ece1e28d3ee0babc4b7736a2da0 Mon Sep 17 00:00:00 2001 From: Pengfei Ni Date: Sun, 5 Apr 2020 06:30:33 +0000 Subject: [PATCH] Ensure VMSS is not under updating before scaling out --- .../cloudprovider/azure/azure_fakes.go | 2 +- .../cloudprovider/azure/azure_scale_set.go | 19 +++++-- .../azure/azure_scale_set_test.go | 56 +++++++++++++++++++ 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go index 3027e4861d..ff5ab484d1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go @@ -80,7 +80,7 @@ func (client *VirtualMachineScaleSetsClientMock) CreateOrUpdateAsync(ctx context // WaitForAsyncOperationResult waits for the response of the request func (client *VirtualMachineScaleSetsClientMock) WaitForAsyncOperationResult(ctx context.Context, future *azure.Future) (*http.Response, error) { - return nil, nil + return &http.Response{StatusCode: http.StatusOK}, nil } // DeleteInstances deletes a set of instances for specified VirtualMachineScaleSet. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 1afa9f3d61..4b82446ec6 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -40,6 +40,7 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute" "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/azure" + "github.com/Azure/go-autorest/autorest/to" ) var ( @@ -253,7 +254,7 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) { return } - klog.Errorf("virtualMachineScaleSetsClient.WaitForCreateOrUpdate for scale set %q failed: %v", scaleSet.Name, err) + klog.Errorf("virtualMachineScaleSetsClient.WaitForAsyncOperationResult for scale set %q failed: %v", scaleSet.Name, err) } // SetScaleSetSize sets ScaleSet size. @@ -261,19 +262,27 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error { scaleSet.sizeMutex.Lock() defer scaleSet.sizeMutex.Unlock() - // Proactively set the VMSS size so autoscaler makes better decisions. - scaleSet.curSize = size - scaleSet.lastSizeRefresh = time.Now() - vmssInfo, rerr := scaleSet.getVMSSInfo() if rerr != nil { klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, rerr) return rerr.Error() } + // Abort scaling to avoid concurrent VMSS scaling if the VMSS is still under updating. + // Note that the VMSS provisioning state would be updated per scaleSet.sizeRefreshPeriod. + if vmssInfo.VirtualMachineScaleSetProperties != nil && strings.EqualFold(to.String(vmssInfo.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.ProvisioningStateUpdating)) { + klog.Errorf("VMSS %q is still under updating, waiting for it finishes before scaling", scaleSet.Name) + return fmt.Errorf("VMSS %q is still under updating", scaleSet.Name) + } + + // Proactively set the VMSS size so autoscaler makes better decisions. + scaleSet.curSize = size + scaleSet.lastSizeRefresh = time.Now() + // Update the new capacity to cache. vmssSizeMutex.Lock() vmssInfo.Sku.Capacity = &size + vmssInfo.VirtualMachineScaleSetProperties.ProvisioningState = to.StringPtr(string(compute.ProvisioningStateUpdating)) vmssSizeMutex.Unlock() // Compose a new VMSS for updating. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 28e1c88dd1..a6f0a66a10 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -20,9 +20,11 @@ import ( "fmt" "net/http" "testing" + "time" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute" "github.com/Azure/go-autorest/autorest" + "github.com/Azure/go-autorest/autorest/to" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -97,6 +99,60 @@ func TestIncreaseSize(t *testing.T) { assert.Equal(t, 5, targetSize) } +func TestIncreaseSizeOnVMSSUpdating(t *testing.T) { + manager := newTestAzureManager(t) + vmssName := "vmss-updating" + var vmssCapacity int64 = 3 + scaleSetClient := &VirtualMachineScaleSetsClientMock{ + FakeStore: map[string]map[string]compute.VirtualMachineScaleSet{ + "test": { + vmssName: { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + ProvisioningState: to.StringPtr(string(compute.ProvisioningStateUpdating)), + }, + }, + }, + }, + } + manager.azClient.virtualMachineScaleSetsClient = scaleSetClient + registered := manager.RegisterAsg(newTestScaleSet(manager, vmssName)) + assert.True(t, registered) + manager.regenerateCache() + + provider, err := BuildAzureCloudProvider(manager, nil) + assert.NoError(t, err) + + // Scaling should fail because VMSS is still under updating. + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + err = scaleSet.IncreaseSize(1) + assert.Equal(t, fmt.Errorf("VMSS %q is still under updating", scaleSet.Name), err) + + // Scaling should succeed after VMSS ProvisioningState changed to succeeded. + scaleSetClient.FakeStore = map[string]map[string]compute.VirtualMachineScaleSet{ + "test": { + vmssName: { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + ProvisioningState: to.StringPtr(string(compute.ProvisioningStateSucceeded)), + }, + }, + }, + } + scaleSetStatusCache.mutex.Lock() + scaleSetStatusCache.lastRefresh = time.Now().Add(-1 * scaleSet.sizeRefreshPeriod) + scaleSetStatusCache.mutex.Unlock() + err = scaleSet.IncreaseSize(1) + assert.NoError(t, err) +} + func TestBelongs(t *testing.T) { provider := newTestProvider(t) registered := provider.azureManager.RegisterAsg(