GCE: Fix race around route deletion

Because the control-plane can recreate routes, there's a race between
deleting instances and deleting routes.  Add a dependency so we don't
try to delete routes until after we've deleted all the instances.
This commit is contained in:
justinsb 2021-11-14 10:30:31 -05:00
parent 3e683ca9e8
commit cfd4e91a2c
2 changed files with 13 additions and 16 deletions

View File

@ -15,7 +15,6 @@ go_library(
"//upup/pkg/fi/cloudup/gce:go_default_library", "//upup/pkg/fi/cloudup/gce:go_default_library",
"//vendor/google.golang.org/api/compute/v1:go_default_library", "//vendor/google.golang.org/api/compute/v1:go_default_library",
"//vendor/google.golang.org/api/dns/v1:go_default_library", "//vendor/google.golang.org/api/dns/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library",
], ],
) )

View File

@ -23,7 +23,6 @@ import (
compute "google.golang.org/api/compute/v1" compute "google.golang.org/api/compute/v1"
clouddns "google.golang.org/api/dns/v1" clouddns "google.golang.org/api/dns/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2" "k8s.io/klog/v2"
"k8s.io/kops/pkg/dns" "k8s.io/kops/pkg/dns"
"k8s.io/kops/pkg/resources" "k8s.io/kops/pkg/resources"
@ -57,6 +56,8 @@ const maxPrefixTokens = 5
const maxGCERouteNameLength = 63 const maxGCERouteNameLength = 63
func ListResourcesGCE(gceCloud gce.GCECloud, clusterName string, region string) (map[string]*resources.Resource, error) { func ListResourcesGCE(gceCloud gce.GCECloud, clusterName string, region string) (map[string]*resources.Resource, error) {
ctx := context.TODO()
if region == "" { if region == "" {
region = gceCloud.Region() region = gceCloud.Region()
} }
@ -71,7 +72,7 @@ func ListResourcesGCE(gceCloud gce.GCECloud, clusterName string, region string)
{ {
// TODO: Only zones in api.Cluster object, if we have one? // TODO: Only zones in api.Cluster object, if we have one?
gceZones, err := d.gceCloud.Compute().Zones().List(context.Background(), d.gceCloud.Project()) gceZones, err := d.gceCloud.Compute().Zones().List(ctx, d.gceCloud.Project())
if err != nil { if err != nil {
return nil, fmt.Errorf("error listing zones: %v", err) return nil, fmt.Errorf("error listing zones: %v", err)
} }
@ -116,10 +117,8 @@ func ListResourcesGCE(gceCloud gce.GCECloud, clusterName string, region string)
} }
// We try to clean up orphaned routes. // We try to clean up orphaned routes.
// Technically we still have a race condition here - until the master(s) are terminated, they will keep
// creating routes. Another option might be to have a post-destroy cleanup, and only remove routes with no target.
{ {
resourceTrackers, err := d.listRoutes(resources) resourceTrackers, err := d.listRoutes(ctx, resources)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -557,20 +556,18 @@ func deleteFirewallRule(cloud fi.Cloud, r *resources.Resource) error {
return c.WaitForOp(op) return c.WaitForOp(op)
} }
func (d *clusterDiscoveryGCE) listRoutes(resourceMap map[string]*resources.Resource) ([]*resources.Resource, error) { func (d *clusterDiscoveryGCE) listRoutes(ctx context.Context, resourceMap map[string]*resources.Resource) ([]*resources.Resource, error) {
c := d.gceCloud c := d.gceCloud
var resourceTrackers []*resources.Resource var resourceTrackers []*resources.Resource
instances := sets.NewString() instancesToDelete := make(map[string]*resources.Resource)
for _, resource := range resourceMap { for _, resource := range resourceMap {
if resource.Type == typeInstance { if resource.Type == typeInstance {
instances.Insert(resource.ID) instancesToDelete[resource.ID] = resource
} }
} }
ctx := context.Background()
// TODO: Push-down prefix? // TODO: Push-down prefix?
routes, err := c.Compute().Routes().List(ctx, c.Project()) routes, err := c.Compute().Routes().List(ctx, c.Project())
if err != nil { if err != nil {
@ -596,7 +593,7 @@ func (d *clusterDiscoveryGCE) listRoutes(resourceMap map[string]*resources.Resou
klog.Warningf("error parsing URL for NextHopInstance=%q", r.NextHopInstance) klog.Warningf("error parsing URL for NextHopInstance=%q", r.NextHopInstance)
} }
if instances.Has(u.Zone + "/" + u.Name) { if instancesToDelete[u.Zone+"/"+u.Name] != nil {
remove = true remove = true
} }
} }
@ -610,10 +607,11 @@ func (d *clusterDiscoveryGCE) listRoutes(resourceMap map[string]*resources.Resou
Obj: r, Obj: r,
} }
// We don't need to block // To avoid race conditions where the control-plane re-adds the routes, we delete routes
//if r.NextHopInstance != "" { // only after we have deleted all the instances.
// resourceTracker.Blocked = append(resourceTracker.Blocks, typeInstance+":"+gce.LastComponent(r.NextHopInstance)) for _, instance := range instancesToDelete {
//} resourceTracker.Blocked = append(resourceTracker.Blocked, typeInstance+":"+instance.ID)
}
klog.V(4).Infof("Found resource: %s", r.SelfLink) klog.V(4).Infof("Found resource: %s", r.SelfLink)
resourceTrackers = append(resourceTrackers, resourceTracker) resourceTrackers = append(resourceTrackers, resourceTracker)