Add detection for VMs that fail provisioning to backoff that nodegroup sooner

When Azure fails to provision a node for a nodegroup due to an instance capacity issue ((Zonal)AllocationFailed) or other reason, the VMSS size increase is still reflected but the new instance gets the status `ProvisioningStateFailed`. This now bubbles up the error to the `cloudprovider.Instance`, where it can be used in `clusterstate` to put the nodegroup into backoff sooner.
This commit is contained in:
dom.bozzuto 2023-01-30 14:53:27 -05:00
parent 240ac79eed
commit 066315cfa2
2 changed files with 49 additions and 0 deletions

View File

@ -679,6 +679,13 @@ func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovid
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
default:
status.State = cloudprovider.InstanceRunning
}

View File

@ -225,6 +225,48 @@ func TestIncreaseSize(t *testing.T) {
}
}
// TestIncreaseSizeOnVMProvisioningFailed verifies that when a VMSS scale-up
// succeeds at the API level but the new VM ends up in ProvisioningStateFailed,
// the failed instance is still reported by Nodes() with state InstanceCreating
// and an OutOfResourcesErrorClass error, so clusterstate can back off the
// node group sooner.
func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	manager := newTestAzureManager(t)
	vmssName := "vmss-failed-upscale"

	// Three VMs exist in the scale set; the last one failed provisioning.
	expectedScaleSets := newTestVMSSList(3, vmssName, "eastus", compute.Uniform)
	expectedVMSSVMs := newTestVMSSVMList(3)
	expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed))

	// The scale-up call itself succeeds — the failure surfaces only through
	// the per-VM provisioning state returned by the VMSS VMs client.
	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
	mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil)
	mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil)
	mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
	manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient
	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
	mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
	manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
	manager.explicitlyConfigured[vmssName] = true

	registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName))
	assert.True(t, registered)
	// A failed refresh would make the later assertions fail confusingly,
	// so check its error instead of discarding it.
	assert.NoError(t, manager.Refresh())

	provider, err := BuildAzureCloudProvider(manager, nil)
	assert.NoError(t, err)

	scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet)
	assert.True(t, ok)

	// Increase size by one, but the new node fails provisioning.
	err = scaleSet.IncreaseSize(1)
	assert.NoError(t, err)

	nodes, err := scaleSet.Nodes()
	assert.NoError(t, err)
	assert.Equal(t, 3, len(nodes))
	// The failed VM must carry the backoff-triggering error info.
	assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State)
	assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass)
}
func TestIncreaseSizeOnVMSSUpdating(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()