From b898c1f15114c6be6875f7f20defb5630f21874d Mon Sep 17 00:00:00 2001 From: Joel Speed Date: Thu, 13 Feb 2020 11:08:32 +0000 Subject: [PATCH] Provide fake proivder IDs for failed machines --- .../clusterapi/clusterapi_controller.go | 32 +++++++++++++++++++ .../clusterapi/clusterapi_controller_test.go | 12 ++++++- .../clusterapi/clusterapi_converters.go | 8 +++++ .../cloudprovider/clusterapi/machine_types.go | 19 +++++++++++ .../clusterapi/zz_generated.deepcopy.go | 5 +++ 5 files changed, 75 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go index 2ad30f2bd7..dece1b3c05 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "os" + "strings" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -46,6 +47,7 @@ const ( resourceNameMachine = "machines" resourceNameMachineSet = "machinesets" resourceNameMachineDeployment = "machinedeployments" + failedMachinePrefix = "failed-machine-" ) // machineController watches for Nodes, Machines, MachineSets and @@ -219,6 +221,16 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide } } + if isFailedMachineProviderID(providerID) { + machine, err := c.findMachine(machineKeyFromFailedProviderID(providerID)) + if err != nil { + return nil, err + } + if machine != nil { + return machine.DeepCopy(), nil + } + } + // If the machine object has no providerID--maybe actuator // does not set this value (e.g., OpenStack)--then first // lookup the node using ProviderID. If that is successful @@ -234,6 +246,15 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide return c.findMachine(node.Annotations[machineAnnotationKey]) } +func isFailedMachineProviderID(providerID normalizedProviderID) bool { + return strings.HasPrefix(string(providerID), failedMachinePrefix) +} + +func machineKeyFromFailedProviderID(providerID normalizedProviderID) string { + namespaceName := strings.TrimPrefix(string(providerID), failedMachinePrefix) + return strings.Replace(namespaceName, "_", "/", 1) +} + // findNodeByNodeName finds the Node object keyed by name.. Returns // nil if it cannot be found. A DeepCopy() of the object is returned // on success. @@ -393,6 +414,17 @@ func (c *machineController) machineSetProviderIDs(machineSet *MachineSet) ([]str continue } + if machine.Status.FailureMessage != nil { + klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.Name, *machine.Status.FailureMessage) + // Provide a fake ID to allow the autoscaler to track machines that will never + // become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime. + // Fake ID needs to be recognised later and converted into a machine key. + // Use an underscore as a separator between namespace and name as it is not a + // valid character within a namespace name. + providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name)) + continue + } + if machine.Status.NodeRef == nil { klog.V(4).Infof("Status.NodeRef of machine %q is currently nil", machine.Name) continue diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go index c590738c55..1735dcdad1 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go @@ -450,7 +450,17 @@ func TestControllerFindMachineByProviderID(t *testing.T) { t.Fatalf("expected machines to be equal - expected %+v, got %+v", testConfig.machines[0], machine) } - // Test #2: Verify machine is not found if it has a + // Test #2: Verify machine returned by fake provider ID is correct machine + fakeProviderID := fmt.Sprintf("%s$s/%s", testConfig.machines[0].Namespace, testConfig.machines[0].Name) + machine, err = controller.findMachineByProviderID(normalizedProviderID(fakeProviderID)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if machine != nil { + t.Fatal("expected find to fail") + } + + // Test #3: Verify machine is not found if it has a // non-existent or different provider ID. machine = testConfig.machines[0].DeepCopy() machine.Spec.ProviderID = pointer.StringPtr("does-not-match") diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go index 9d88362559..0b90285280 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_converters.go @@ -115,6 +115,10 @@ func newMachineFromUnstructured(u *unstructured.Unstructured) *Machine { machine.Status.NodeRef = &nodeRef } + if failureMessage, _, _ := unstructured.NestedString(u.Object, "status", "failureMessage"); failureMessage != "" { + machine.Status.FailureMessage = pointer.StringPtr(failureMessage) + } + return &machine } @@ -184,5 +188,9 @@ func newUnstructuredFromMachine(m *Machine) *unstructured.Unstructured { } } + if m.Status.FailureMessage != nil { + unstructured.SetNestedField(u.Object, *m.Status.FailureMessage, "status", "failureMessage") + } + return &u } diff --git a/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go b/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go index 4f4e968ac2..db167b982c 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/machine_types.go @@ -63,6 +63,25 @@ type MachineStatus struct { // NodeRef will point to the corresponding Node if it exists. // +optional NodeRef *corev1.ObjectReference `json:"nodeRef,omitempty"` + + // FailureMessage will be set in the event that there is a terminal problem + // reconciling the Machine and will contain a more verbose string suitable + // for logging and human consumption. + // + // This field should not be set for transitive errors that a controller + // faces that are expected to be fixed automatically over + // time (like service outages), but instead indicate that something is + // fundamentally wrong with the Machine's spec or the configuration of + // the controller, and that manual intervention is required. Examples + // of terminal errors would be invalid combinations of settings in the + // spec, values that are unsupported by the controller, or the + // responsible controller itself being critically misconfigured. + // + // Any transient errors that occur during the reconciliation of Machines + // can be added as events to the Machine object and/or logged in the + // controller's output. + // +optional + FailureMessage *string `json:"failureMessage,omitempty"` } // MachineList contains a list of Machine diff --git a/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go b/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go index 9948f120e8..9b702a8626 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/zz_generated.deepcopy.go @@ -318,6 +318,11 @@ func (in *MachineSpec) DeepCopy() *MachineSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MachineStatus) DeepCopyInto(out *MachineStatus) { *out = *in + if in.FailureMessage != nil { + in, out := &in.FailureMessage, &out.FailureMessage + *out = new(string) + **out = **in + } if in.NodeRef != nil { in, out := &in.NodeRef, &out.NodeRef *out = new(v1.ObjectReference)