From 65dd16a30195ff51edab37f223af400402b2439f Mon Sep 17 00:00:00 2001 From: ederst Date: Thu, 6 Apr 2023 22:33:28 +0200 Subject: [PATCH] OpenStack: Use task engine to retry failed servers This will make use of the kOps taks engine to retry failed servers. The former approach had the side effect of not making kOps fail when the last retry failed: Because there is now a server present - although in an erroneous state - the "instance task" which the task engine retried reconciled the server (port, floating ip) instead of recreating it again. With the approach of letting the task engine retry the failed servers this will be handled correctly. --- upup/pkg/fi/cloudup/openstack/instance.go | 6 +----- upup/pkg/fi/cloudup/openstacktasks/instance.go | 14 +++++++++++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/upup/pkg/fi/cloudup/openstack/instance.go b/upup/pkg/fi/cloudup/openstack/instance.go index c8a98bf936..7af275c953 100644 --- a/upup/pkg/fi/cloudup/openstack/instance.go +++ b/upup/pkg/fi/cloudup/openstack/instance.go @@ -101,10 +101,6 @@ func createInstance(c OpenstackCloud, opt servers.CreateOptsBuilder, portID stri var server *servers.Server done, err := vfs.RetryWithBackoff(writeBackoff, func() (bool, error) { - if server != nil { - // Note: this will delete the server from the last try, even if it is now ACTIVE or still in BUILD state - c.DeleteInstanceWithID(server.ID) - } v, err := servers.Create(c.ComputeClient(), opt).Extract() if err != nil { @@ -131,7 +127,7 @@ func createInstance(c OpenstackCloud, opt servers.CreateOptsBuilder, portID stri err = waitForStatusActive(c, server.ID, nil) if err != nil { - return false, fmt.Errorf("error while waiting for server '%s' to become '%s': %v", server.ID, activeStatus, err) + return true, err } return true, nil diff --git a/upup/pkg/fi/cloudup/openstacktasks/instance.go b/upup/pkg/fi/cloudup/openstacktasks/instance.go index 3ebd49deb8..c47a79d9be 100644 --- a/upup/pkg/fi/cloudup/openstacktasks/instance.go +++ b/upup/pkg/fi/cloudup/openstacktasks/instance.go @@ -52,6 +52,7 @@ type Instance struct { SecurityGroups []string FloatingIP *FloatingIP ConfigDrive *bool + Status *string Lifecycle fi.Lifecycle ForAPIServer bool @@ -198,6 +199,7 @@ func (e *Instance) Find(c *fi.CloudupContext) (*Instance, error) { AvailabilityZone: e.AvailabilityZone, GroupName: e.GroupName, ConfigDrive: e.ConfigDrive, + Status: fi.PtrTo(server.Status), } ports, err := cloud.ListPorts(ports.ListOpts{ @@ -244,6 +246,7 @@ func (e *Instance) Find(c *fi.CloudupContext) (*Instance, error) { // Avoid flapping e.ID = actual.ID + e.Status = fi.PtrTo(activeStatus) actual.ForAPIServer = e.ForAPIServer // Immutable fields @@ -281,6 +284,9 @@ func (_ *Instance) ShouldCreate(a, e, changes *Instance) (bool, error) { if a == nil { return true, nil } + if fi.ValueOf(a.Status) == errorStatus { + return true, nil + } if changes.Port != nil { return true, nil } @@ -309,7 +315,13 @@ func generateInstanceName(e *Instance) (string, error) { func (_ *Instance) RenderOpenstack(t *openstack.OpenstackAPITarget, a, e, changes *Instance) error { cloud := t.Cloud - if a == nil { + + if a != nil && fi.ValueOf(a.Status) == errorStatus { + klog.V(2).Infof("Delete previously failed server: %s\n", fi.ValueOf(a.ID)) + cloud.DeleteInstanceWithID(fi.ValueOf(a.ID)) + } + + if a == nil || fi.ValueOf(a.Status) == errorStatus { serverName, err := generateInstanceName(e) if err != nil { return err