Merge pull request #5548 from DataDog/azure-fast-backoff

Azure: Fast nodegroup backoff on failed provisioning
This commit is contained in:
Kubernetes Prow Robot 2023-04-25 14:36:16 -07:00 committed by GitHub
commit 0142a57730
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 49 additions and 0 deletions

View File

@ -679,6 +679,13 @@ func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovid
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
default:
status.State = cloudprovider.InstanceRunning
}

View File

@ -225,6 +225,48 @@ func TestIncreaseSize(t *testing.T) {
}
}
// TestIncreaseSizeOnVMProvisioningFailed checks that, after IncreaseSize(1) on
// a scale set whose third VM reports ProvisioningStateFailed, Nodes() still
// lists three instances and the failed one is reported as InstanceCreating
// with an ErrorInfo of class OutOfResourcesErrorClass.
func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	manager := newTestAzureManager(t)
	vmssName := "vmss-failed-upscale"

	expectedScaleSets := newTestVMSSList(3, vmssName, "eastus", compute.Uniform)
	expectedVMSSVMs := newTestVMSSVMList(3)
	// Mark the last VM in the list as having failed provisioning.
	expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed))

	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
	mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil)
	mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil)
	mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
	manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient

	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
	mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
	manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient

	manager.explicitlyConfigured[vmssName] = true
	registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName))
	assert.True(t, registered)
	manager.Refresh()

	provider, err := BuildAzureCloudProvider(manager, nil)
	assert.NoError(t, err)

	// Stop early on a failed precondition so the test reports an assertion
	// failure instead of panicking on an out-of-range index or nil pointer.
	nodeGroups := provider.NodeGroups()
	if !assert.NotEmpty(t, nodeGroups) {
		return
	}
	scaleSet, ok := nodeGroups[0].(*ScaleSet)
	if !assert.True(t, ok) {
		return
	}

	// Increase size by one, but the new node fails provisioning
	err = scaleSet.IncreaseSize(1)
	assert.NoError(t, err)

	nodes, err := scaleSet.Nodes()
	assert.NoError(t, err)
	if !assert.Len(t, nodes, 3) {
		return
	}

	failedStatus := nodes[2].Status
	if !assert.NotNil(t, failedStatus) {
		return
	}
	assert.Equal(t, cloudprovider.InstanceCreating, failedStatus.State)
	if !assert.NotNil(t, failedStatus.ErrorInfo) {
		return
	}
	assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, failedStatus.ErrorInfo.ErrorClass)
}
func TestIncreaseSizeOnVMSSUpdating(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()