Merge pull request #7708 from comtalyst/comtalyst/dont-crash-no-vmss

fix: don't crash when vmss not present or has no nodes

Commit 31f561693d
@@ -170,7 +170,7 @@ func (m *azureCache) regenerate() error {
 		if err != nil {
 			return err
 		}
-		klog.V(4).Infof("regenerate: found nodes for node group %s: %+v", ng.Id(), instances)
+		klog.V(4).Infof("regenerate: found %d nodes for node group %s: %+v", len(instances), ng.Id(), instances)
 
 		for _, instance := range instances {
 			ref := azureRef{Name: instance.Id}
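For illustration, a runnable sketch of what the updated log line prints when a scale set has no VMs; the node group name and the empty instance slice here are made up, only the format string comes from the diff above:

package main

import "fmt"

// Mirrors the new klog format string: leading with the instance count
// makes a node group with zero nodes obvious at a glance.
func main() {
	instances := []string{} // e.g. a VMSS that currently has no VMs
	fmt.Printf("regenerate: found %d nodes for node group %s: %+v\n", len(instances), "ng-1", instances)
}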
@@ -1249,6 +1249,49 @@ func TestGetScaleSetOptions(t *testing.T) {
 	assert.Equal(t, *opts, defaultOptions)
 }
 
+// TestVMSSNotFound ensures that AzureManager can still be built
+// when one nodeGroup (VMSS) is not found. Previously, manager creation
+// failed if even one expected nodeGroup was missing; when that happened,
+// BuildAzure called log.Fatalf(), which crashed CAS.
+func TestVMSSNotFound(t *testing.T) {
+	// client setup
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
+	mockVMClient := mockvmclient.NewMockInterface(ctrl)
+	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
+	client := azClient{}
+	client.virtualMachineScaleSetsClient = mockVMSSClient
+	client.virtualMachinesClient = mockVMClient
+	client.virtualMachineScaleSetVMsClient = mockVMSSVMClient
+
+	// Expect that no VMSS are present in the clients
+	mockVMSSVMClient.EXPECT().List(gomock.Any(), "fakeId", testASG, gomock.Any()).Return([]compute.VirtualMachineScaleSetVM{}, nil).AnyTimes()
+	mockVMClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachine{}, nil).AnyTimes()
+	mockVMSSClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachineScaleSet{}, nil).AnyTimes()
+
+	// Add an explicit node group to look for during init
+	ngdo := cloudprovider.NodeGroupDiscoveryOptions{
+		NodeGroupSpecs: []string{
+			fmt.Sprintf("%d:%d:%s", 1, 3, testASG),
+		},
+	}
+
+	// We expect the initial BuildAzure flow to pass when a NodeGroup is detected
+	// that doesn't have a corresponding VMSS in the cache.
+	t.Run("should not error when VMSS not found in cache", func(t *testing.T) {
+		manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfg), ngdo, &client)
+		assert.NoError(t, err)
+		// expect one node group to be present
+		nodeGroups := manager.getNodeGroups()
+		assert.Len(t, nodeGroups, 1)
+		assert.Equal(t, nodeGroups[0].Id(), testASG)
+		// expect no scale sets to be present
+		scaleSets := manager.azureCache.getScaleSets()
+		assert.Len(t, scaleSets, 0)
+	})
+}
+
 func assertStructsMinimallyEqual(t *testing.T, struct1, struct2 interface{}) bool {
 	return compareStructFields(t, reflect.ValueOf(struct1), reflect.ValueOf(struct2))
 }
@@ -167,7 +167,12 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
 func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
 	template, err := scaleSet.getVMSSFromCache()
 	if err != nil {
-		return nil, err
+		klog.Errorf("failed to get information for VMSS: %s", scaleSet.Name)
+		// Note: We don't return an error here and instead accept defaults.
+		// Callers fail on any error that meets this condition:
+		// `if err != nil && err != cloudprovider.ErrNotImplemented`
+		// so the error return value is effectively reserved for "not implemented".
+		return nil, nil
 	}
 	return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
 }
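For context, a minimal sketch of the caller-side convention that comment refers to. resolveOptions and its wiring are hypothetical; only the quoted error check and the fallback-to-defaults behavior come from the diff:

package example

import (
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/config"
)

// resolveOptions mirrors the convention the diff comment describes: only
// cloudprovider.ErrNotImplemented is tolerated, so a node group that cannot
// load its VMSS must return (nil, nil) to fall back to the configured
// defaults instead of failing the whole iteration.
func resolveOptions(ng cloudprovider.NodeGroup, defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
	opts, err := ng.GetOptions(defaults)
	if err != nil && err != cloudprovider.ErrNotImplemented {
		return nil, err // any other error is treated as fatal
	}
	if opts == nil {
		opts = &defaults // nil options means "accept the defaults"
	}
	return opts, nil
}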
@@ -187,14 +192,14 @@ func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, er
 	return allVMSS[scaleSet.Name], nil
 }
 
-func (scaleSet *ScaleSet) getCurSize() (int64, error) {
+func (scaleSet *ScaleSet) getCurSize() (int64, *GetVMSSFailedError) {
 	scaleSet.sizeMutex.Lock()
 	defer scaleSet.sizeMutex.Unlock()
 
 	set, err := scaleSet.getVMSSFromCache()
 	if err != nil {
 		klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
-		return -1, err
+		return -1, newGetVMSSFailedError(err, true)
 	}
 
 	// // Remove check for returning in-memory size when VMSS is in updating state
@@ -231,7 +236,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
 		set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name)
 		if rerr != nil {
 			klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr)
-			return -1, err
+			return -1, newGetVMSSFailedError(rerr.Error(), rerr.IsNotFound())
 		}
 	}
 
@@ -253,12 +258,11 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
 
 // getScaleSetSize gets Scale Set size.
 func (scaleSet *ScaleSet) getScaleSetSize() (int64, error) {
-	// First, get the size of the ScaleSet reported by API
-	// -1 indiciates the ScaleSet hasn't been initialized
-	size, err := scaleSet.getCurSize()
-	if size == -1 || err != nil {
-		klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, err)
-		return size, err
+	// First, get the current size of the ScaleSet
+	size, getVMSSError := scaleSet.getCurSize()
+	if size == -1 || getVMSSError != nil {
+		klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, getVMSSError.error)
+		return size, getVMSSError.error
 	}
 	return size, nil
 }
@@ -666,10 +670,13 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*framework.NodeInfo, error) {
 
 // Nodes returns a list of all nodes that belong to this node group.
 func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
-	curSize, err := scaleSet.getCurSize()
-	if err != nil {
-		klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, err)
-		return nil, err
+	curSize, getVMSSError := scaleSet.getCurSize()
+	if getVMSSError != nil {
+		klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, getVMSSError.error)
+		if getVMSSError.notFound {
+			return []cloudprovider.Instance{}, nil // Don't return error if VMSS not found
+		}
+		return nil, getVMSSError.error // We want to return error if other errors occur.
 	}
 
 	scaleSet.instanceMutex.Lock()
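To make the behavior change concrete, a self-contained toy sketch of the new Nodes() contract (all names hypothetical): a missing VMSS now yields an empty slice and a nil error, so the caller's refresh loop keeps running instead of aborting and eventually crashing CAS.

package main

import "fmt"

// nodesFor is a stand-in for ScaleSet.Nodes(): a not-found VMSS reports
// zero instances without an error; only other failures propagate.
func nodesFor(vmssExists bool) ([]string, error) {
	if !vmssExists {
		return []string{}, nil // missing VMSS: zero nodes, no error
	}
	return []string{"azure://vmss/virtualMachines/0"}, nil
}

func main() {
	for _, exists := range []bool{true, false} {
		nodes, err := nodesFor(exists)
		fmt.Printf("vmssExists=%v -> nodes=%d err=%v\n", exists, len(nodes), err)
	}
}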
@@ -682,7 +689,7 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
 	}
 
 	// Forcefully updating the instanceCache as the instanceCacheSize didn't match curSize or cache is invalid.
-	err = scaleSet.updateInstanceCache()
+	err := scaleSet.updateInstanceCache()
 	if err != nil {
 		return nil, err
 	}
@@ -901,3 +908,21 @@ func (scaleSet *ScaleSet) verifyNodeGroup(instance *azureRef, commonNgID string)
 	}
 	return nil
 }
+
+// GetVMSSFailedError is used to differentiate between
+// NotFound and other errors
+type GetVMSSFailedError struct {
+	notFound bool
+	error    error
+}
+
+func newGetVMSSFailedError(error error, notFound bool) *GetVMSSFailedError {
+	return &GetVMSSFailedError{
+		error:    error,
+		notFound: notFound,
+	}
+}
+
+func (v *GetVMSSFailedError) Error() string {
+	return v.error.Error()
+}
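A self-contained usage sketch of this error type, redeclared here so the example runs standalone (the sample error message is made up): callers inspect notFound to decide whether a failure means "treat the scale set as empty" or is fatal.

package main

import (
	"errors"
	"fmt"
)

// Same shape as the type introduced above: a notFound flag carried
// alongside the underlying error, avoiding error-string matching.
type GetVMSSFailedError struct {
	notFound bool
	error    error
}

func newGetVMSSFailedError(err error, notFound bool) *GetVMSSFailedError {
	return &GetVMSSFailedError{error: err, notFound: notFound}
}

func (v *GetVMSSFailedError) Error() string {
	return v.error.Error()
}

func main() {
	gerr := newGetVMSSFailedError(errors.New("VMSS test-asg not found"), true)
	if gerr.notFound {
		fmt.Println("tolerated:", gerr.Error()) // e.g. report an empty node group
	} else {
		fmt.Println("fatal:", gerr.Error())
	}
}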