From 55634011c48af4bdaffa43a482151e9661efcd4d Mon Sep 17 00:00:00 2001
From: chaodaiG <45011425+chaodaiG@users.noreply.github.com>
Date: Fri, 13 Sep 2019 11:05:35 -0700
Subject: [PATCH] Add cluster deletion function (#674)

* Add cluster deletion function

* Apply suggestions from code review

Co-Authored-By: Adriano Cunha <35786489+adrcunha@users.noreply.github.com>
---
 testutils/clustermanager/boskos/fake/fake.go |  25 ++-
 testutils/clustermanager/client.go           |   1 +
 testutils/clustermanager/gke.go              |  74 ++++++-
 testutils/clustermanager/gke_test.go         | 194 ++++++++++++++++++-
 4 files changed, 269 insertions(+), 25 deletions(-)

diff --git a/testutils/clustermanager/boskos/fake/fake.go b/testutils/clustermanager/boskos/fake/fake.go
index 9f3ffb864..42b1a95b6 100644
--- a/testutils/clustermanager/boskos/fake/fake.go
+++ b/testutils/clustermanager/boskos/fake/fake.go
@@ -23,16 +23,32 @@ import (
 	"knative.dev/pkg/testutils/clustermanager/boskos"
 )
 
+const (
+	fakeOwner = "fake-owner"
+)
+
 // FakeBoskosClient implements boskos.Operation
 type FakeBoskosClient struct {
 	resources []*boskoscommon.Resource
 }
 
+func (c *FakeBoskosClient) getOwner(host *string) string {
+	if nil == host {
+		return fakeOwner
+	}
+	return *host
+}
+
+func (c *FakeBoskosClient) GetResources() []*boskoscommon.Resource {
+	return c.resources
+}
+
 // AcquireGKEProject fakes to be no op
 func (c *FakeBoskosClient) AcquireGKEProject(host *string) (*boskoscommon.Resource, error) {
 	for _, res := range c.resources {
 		if res.State == boskoscommon.Free {
 			res.State = boskoscommon.Busy
+			res.Owner = c.getOwner(host)
 			return res, nil
 		}
 	}
@@ -41,18 +57,15 @@ func (c *FakeBoskosClient) AcquireGKEProject(host *string) (*boskoscommon.Resour
 
 // ReleaseGKEProject fakes to be no op
 func (c *FakeBoskosClient) ReleaseGKEProject(host *string, name string) error {
-	if nil == host {
-		return fmt.Errorf("host has to be set")
-	}
-
+	owner := c.getOwner(host)
 	for _, res := range c.resources {
 		if res.Name == name {
-			if res.Owner == *host {
+			if res.Owner == owner {
 				res.Owner = ""
 				res.State = boskoscommon.Free
 				return nil
 			} else {
-				return fmt.Errorf("Got owner: '%s', expect owner: '%s'", res.Owner, *host)
+				return fmt.Errorf("Got owner: '%s', expect owner: '%s'", res.Owner, owner)
 			}
 		}
 	}
diff --git a/testutils/clustermanager/client.go b/testutils/clustermanager/client.go
index 803b651bf..42bbec8d0 100644
--- a/testutils/clustermanager/client.go
+++ b/testutils/clustermanager/client.go
@@ -26,4 +26,5 @@ type ClusterOperations interface {
 	Provider() string
 	Initialize() error
 	Acquire() error
+	Delete() error
 }
diff --git a/testutils/clustermanager/gke.go b/testutils/clustermanager/gke.go
index a0b04129d..dfdd23a5e 100644
--- a/testutils/clustermanager/gke.go
+++ b/testutils/clustermanager/gke.go
@@ -46,6 +46,7 @@ var (
 	protectedClusters = []string{"knative-prow"}
 	// These are arbitrary numbers determined based on past experience
 	creationTimeout = 20 * time.Minute
+	deletionTimeout = 10 * time.Minute
 )
 
 // GKEClient implements Client
@@ -77,6 +78,7 @@ type GKECluster struct {
 // GKESDKOperations wraps GKE SDK related functions
 type GKESDKOperations interface {
 	create(string, string, *container.CreateClusterRequest) (*container.Operation, error)
+	delete(string, string, string) (*container.Operation, error)
 	get(string, string, string) (*container.Cluster, error)
 	getOperation(string, string, string) (*container.Operation, error)
 }
@@ -91,6 +93,12 @@ func (gsc *GKESDKClient) create(project, location string, rb *container.CreateCl
 	return gsc.Projects.Locations.Clusters.Create(parent, rb).Context(context.Background()).Do()
 }
 
+// delete deletes the GKE cluster; callers wait for its completion via wait()
+func (gsc *GKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
+	clusterFullPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, clusterName)
+	return gsc.Projects.Locations.Clusters.Delete(clusterFullPath).Context(context.Background()).Do()
+}
+
 func (gsc *GKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
 	clusterFullPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, cluster)
 	return gsc.Projects.Locations.Clusters.Get(clusterFullPath).Context(context.Background()).Do()
@@ -222,7 +230,10 @@ func (gc *GKECluster) Acquire() error {
 		}
 	}
 	var cluster *container.Cluster
+	var op *container.Operation
 	for i, region := range regions {
+		// Reset any error left over from the previous region
+		err = nil
 		rb := &container.CreateClusterRequest{
 			Cluster: &container.Cluster{
 				Name: clusterName,
@@ -235,24 +246,36 @@ func (gc *GKECluster) Acquire() error {
 		}
 
 		clusterLoc := getClusterLocation(region, gc.Request.Zone)
-		// TODO(chaodaiG): add deleting logic once cluster deletion logic is done
-		log.Printf("Creating cluster %q' in %q", clusterName, clusterLoc)
-		var createOp *container.Operation
-		createOp, err = gc.operations.create(*gc.Project, clusterLoc, rb)
+		// Delete the cluster if it already exists
+		existingCluster, _ := gc.operations.get(*gc.Project, clusterLoc, clusterName)
+		if nil != existingCluster {
+			log.Printf("Cluster %q already exists in %q. Deleting...", clusterName, clusterLoc)
+			op, err = gc.operations.delete(*gc.Project, clusterName, clusterLoc)
+			if nil == err {
+				err = gc.wait(clusterLoc, op.Name, deletionTimeout)
+			}
+		}
+		// Create the cluster only if the previous step succeeded
 		if nil == err {
-			if err = gc.wait(clusterLoc, createOp.Name, creationTimeout); nil == err {
-				cluster, err = gc.operations.get(*gc.Project, clusterLoc, rb.Cluster.Name)
+			log.Printf("Creating cluster %q in %q", clusterName, clusterLoc)
+			op, err = gc.operations.create(*gc.Project, clusterLoc, rb)
+			if nil == err {
+				if err = gc.wait(clusterLoc, op.Name, creationTimeout); nil == err {
+					cluster, err = gc.operations.get(*gc.Project, clusterLoc, rb.Cluster.Name)
+				}
 			}
 		}
 		if nil != err {
-			errMsg := fmt.Sprintf("error creating cluster: '%v'", err)
+			errMsg := fmt.Sprintf("Error during cluster creation: '%v'. ", err)
 			if gc.NeedCleanup {
 				// Delete half created cluster if it's user created
-				// TODO(chaodaiG): add this part when deletion logic is done
+				errMsg = fmt.Sprintf("%sDeleting cluster %q in %q in background...\n", errMsg, clusterName, clusterLoc)
+				go gc.operations.delete(*gc.Project, clusterName, clusterLoc)
 			}
+			// Retry another region if cluster creation failed.
 			// TODO(chaodaiG): catch specific errors as we know what the error look like for stockout etc.
 			if len(regions) != i+1 {
-				errMsg = fmt.Sprintf("%s. Retry another region '%s' for cluster creation", errMsg, regions[i+1])
+				errMsg = fmt.Sprintf("%sRetry another region %q for cluster creation", errMsg, regions[i+1])
 			}
 			log.Printf(errMsg)
 		} else {
@@ -265,6 +288,39 @@ func (gc *GKECluster) Acquire() error {
 	return err
 }
 
+// Delete takes care of GKE cluster resource cleanup. It only releases the
+// Boskos resource if running in Prow; otherwise it deletes the cluster if NeedCleanup is set
+func (gc *GKECluster) Delete() error {
+	gc.ensureProtected()
+	// Release Boskos if running in Prow; the Janitor will take care of
+	// deleting the clusters
+	if common.IsProw() {
+		log.Printf("Releasing Boskos resource: '%v'", *gc.Project)
+		return gc.boskosOps.ReleaseGKEProject(nil, *gc.Project)
+	}
+
+	// NeedCleanup is only true if running locally and the cluster was created
+	// by this process
+	if !gc.NeedCleanup {
+		return nil
+	}
+	// Should only get here if running locally and the cluster was created by
+	// this client, so at this point gc.Cluster should have been set
+	if nil == gc.Cluster {
+		return fmt.Errorf("cluster doesn't exist")
+	}
+
+	log.Printf("Deleting cluster %q in %q", gc.Cluster.Name, gc.Cluster.Location)
+	op, err := gc.operations.delete(*gc.Project, gc.Cluster.Name, gc.Cluster.Location)
+	if nil == err {
+		err = gc.wait(gc.Cluster.Location, op.Name, deletionTimeout)
+	}
+	if nil != err {
+		return fmt.Errorf("failed deleting cluster: '%v'", err)
+	}
+	return nil
+}
+
 // wait depends on unique opName(operation ID created by cloud), and waits until
 // it's done
 func (gc *GKECluster) wait(location, opName string, wait time.Duration) error {
diff --git a/testutils/clustermanager/gke_test.go b/testutils/clustermanager/gke_test.go
index eab7a31e6..8fad9958c 100644
--- a/testutils/clustermanager/gke_test.go
+++ b/testutils/clustermanager/gke_test.go
@@ -17,16 +17,19 @@ limitations under the License.
 package clustermanager
 
 import (
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"reflect"
+	"strconv"
 	"strings"
 	"testing"
 	"time"
 
 	"google.golang.org/api/container/v1"
 
+	boskoscommon "k8s.io/test-infra/boskos/common"
 	boskosFake "knative.dev/pkg/testutils/clustermanager/boskos/fake"
 	"knative.dev/pkg/testutils/common"
 )
@@ -67,7 +70,7 @@ func newFakeGKESDKClient() *FakeGKESDKClient {
 // fgsc.opStatus by fgsc.opStatus[string(fgsc.opNumber+1)]="PENDING" to make the
 // next operation pending
 func (fgsc *FakeGKESDKClient) newOp() *container.Operation {
-	opName := string(fgsc.opNumber)
+	opName := strconv.Itoa(fgsc.opNumber)
 	op := &container.Operation{
 		Name:   opName,
 		Status: "DONE",
@@ -87,7 +90,7 @@ func (fgsc *FakeGKESDKClient) create(project, location string, rb *container.Cre
 	if cls, ok := fgsc.clusters[parent]; ok {
 		for _, cl := range cls {
 			if cl.Name == name {
-				return nil, fmt.Errorf("cluster already exist")
+				return nil, errors.New("cluster already exists")
 			}
 		}
 	} else {
@@ -103,6 +106,24 @@ func (fgsc *FakeGKESDKClient) create(project, location string, rb *container.Cre
 	return fgsc.newOp(), nil
 }
 
+func (fgsc *FakeGKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
+	parent := fmt.Sprintf("projects/%s/locations/%s", project, location)
+	found := -1
+	if clusters, ok := fgsc.clusters[parent]; ok {
+		for i, cluster := range clusters {
+			if cluster.Name == clusterName {
+				found = i
+			}
+		}
+	}
+	if found == -1 {
+		return nil, fmt.Errorf("cluster %q not found for deletion", clusterName)
+	}
+	// Remove this cluster from the fake's store
+	fgsc.clusters[parent] = append(fgsc.clusters[parent][:found], fgsc.clusters[parent][found+1:]...)
+	return fgsc.newOp(), nil
+}
+
 func (fgsc *FakeGKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
 	parent := fmt.Sprintf("projects/%s/locations/%s", project, location)
 	if cls, ok := fgsc.clusters[parent]; ok {
@@ -478,6 +499,7 @@ func TestAcquire(t *testing.T) {
 	fakeBuildID := "1234"
 	datas := []struct {
 		existCluster       *container.Cluster
+		kubeconfigSet      bool
 		nextOpStatus       []string
 		expClusterName     string
 		expClusterLocation string
@@ -488,20 +510,33 @@
 			&container.Cluster{
 				Name:     "customcluster",
 				Location: "us-central1",
-			}, []string{}, "customcluster", "us-central1", nil,
+			}, true, []string{}, "customcluster", "us-central1", nil,
+		}, {
+			// cluster exists but is not set in kubeconfig; it will be deleted,
+			// then recreated
+			&container.Cluster{
+				Name:     fakeClusterName,
+				Location: "us-central1",
+			}, false, []string{}, fakeClusterName, "us-central1", nil,
+		}, {
+			// cluster exists but is not set in kubeconfig; cluster deletion
+			// fails, so the cluster will be recreated in us-west1
+			&container.Cluster{
+				Name:     fakeClusterName,
+				Location: "us-central1",
+			}, false, []string{"BAD"}, fakeClusterName, "us-west1", nil,
 		}, {
 			// cluster creation succeeded
-			nil, []string{}, fakeClusterName, "us-central1", nil,
+			nil, false, []string{}, fakeClusterName, "us-central1", nil,
 		}, {
 			// cluster creation succeeded retry
-			nil, []string{"PENDING"}, fakeClusterName, "us-west1", nil,
+			nil, false, []string{"PENDING"}, fakeClusterName, "us-west1", nil,
 		}, {
 			// cluster creation failed all retry
-			nil, []string{"PENDING", "PENDING", "PENDING"},
-			"", "", fmt.Errorf("timed out waiting"),
+			nil, false, []string{"PENDING", "PENDING", "PENDING"}, "", "", fmt.Errorf("timed out waiting"),
 		}, {
 			// cluster creation went bad state
-			nil, []string{"BAD", "BAD", "BAD"}, "", "", fmt.Errorf("unexpected operation status: %q", "BAD"),
+			nil, false, []string{"BAD", "BAD", "BAD"}, "", "", fmt.Errorf("unexpected operation status: %q", "BAD"),
 		},
 	}
 
@@ -528,12 +563,22 @@ func TestAcquire(t *testing.T) {
 			return oldFunc(key)
 		}
 		fgc := setupFakeGKECluster()
+		opCount := 0
 		if nil != data.existCluster {
-			fgc.Cluster = data.existCluster
+			opCount++
+			fgc.operations.create(fakeProj, data.existCluster.Location, &container.CreateClusterRequest{
+				Cluster: &container.Cluster{
+					Name: data.existCluster.Name,
+				},
+				ProjectId: fakeProj,
+			})
+			if data.kubeconfigSet {
+				fgc.Cluster = data.existCluster
+			}
 		}
 		fgc.Project = &fakeProj
 		for i, status := range data.nextOpStatus {
-			fgc.operations.(*FakeGKESDKClient).opStatus[string(i)] = status
+			fgc.operations.(*FakeGKESDKClient).opStatus[strconv.Itoa(opCount+i)] = status
 		}
 
 		fgc.Request = &GKERequest{
@@ -543,6 +588,9 @@ func TestAcquire(t *testing.T) {
 			Zone:          "",
 			BackupRegions: DefaultGKEBackupRegions,
 		}
+		// Set NeedCleanup to false for easier testing, as cleanup launches a
+		// goroutine
+		fgc.NeedCleanup = false
 		err := fgc.Acquire()
 		var gotName, gotLocation string
 		if nil != fgc.Cluster {
@@ -555,3 +603,129 @@ func TestAcquire(t *testing.T) {
 		}
 	}
 }
+
+func TestDelete(t *testing.T) {
+	datas := []struct {
+		isProw      bool
+		needCleanup bool
+		boskosState []*boskoscommon.Resource
+		cluster     *container.Cluster
+		expBoskos   []*boskoscommon.Resource
+		expCluster  *container.Cluster
+		expErr      error
+	}{
+		{
+			// Not in Prow, NeedCleanup is false
+			false,
+			false,
+			[]*boskoscommon.Resource{},
+			&container.Cluster{
+				Name:     "customcluster",
+				Location: "us-central1",
+			},
+			nil,
+			&container.Cluster{
+				Name:     "customcluster",
+				Location: "us-central1",
+				Status:   "RUNNING",
+			},
+			nil,
+		}, {
+			// Not in Prow, NeedCleanup is true
+			false,
+			true,
+			[]*boskoscommon.Resource{},
+			&container.Cluster{
+				Name:     "customcluster",
+				Location: "us-central1",
+			},
+			nil,
+			nil,
+			nil,
+		}, {
+			// Not in Prow, NeedCleanup is true, but cluster doesn't exist
+			false,
+			true,
+			[]*boskoscommon.Resource{},
+			nil,
+			nil,
+			nil,
+			fmt.Errorf("cluster doesn't exist"),
+		}, {
+			// In Prow, only need to release the Boskos resource
+			true,
+			true,
+			[]*boskoscommon.Resource{&boskoscommon.Resource{
+				Name: fakeProj,
+			}},
+			&container.Cluster{
+				Name:     "customcluster",
+				Location: "us-central1",
+			},
+			[]*boskoscommon.Resource{&boskoscommon.Resource{
+				Type:  "gke-project",
+				Name:  fakeProj,
+				State: boskoscommon.Free,
+			}},
+			&container.Cluster{
+				Name:     "customcluster",
+				Location: "us-central1",
+				Status:   "RUNNING",
+			},
+			nil,
+		},
+	}
+
+	// mock GetOSEnv for testing
+	oldFunc := common.GetOSEnv
+	// mock the timeout so tests don't run forever
+	oldTimeout := deletionTimeout
+	deletionTimeout = 100 * time.Millisecond
+	defer func() {
+		// restore
+		common.GetOSEnv = oldFunc
+		deletionTimeout = oldTimeout
+	}()
+
+	for _, data := range datas {
+		common.GetOSEnv = func(key string) string {
+			switch key {
+			case "PROW_JOB_ID": // needed to mock IsProw()
+				if data.isProw {
+					return "fake_job_id"
+				}
+				return ""
+			}
+			return oldFunc(key)
+		}
+		fgc := setupFakeGKECluster()
+		fgc.Project = &fakeProj
+		fgc.NeedCleanup = data.needCleanup
+		if nil != data.cluster {
+			fgc.operations.create(fakeProj, data.cluster.Location, &container.CreateClusterRequest{
+				Cluster: &container.Cluster{
+					Name: data.cluster.Name,
+				},
+				ProjectId: fakeProj,
+			})
+			fgc.Cluster = data.cluster
+		}
+		// Set up fake Boskos
+		for _, bos := range data.boskosState {
+			fgc.boskosOps.(*boskosFake.FakeBoskosClient).NewGKEProject(bos.Name)
+			// Acquire with the default user
+			fgc.boskosOps.(*boskosFake.FakeBoskosClient).AcquireGKEProject(nil)
+		}
+
+		err := fgc.Delete()
+		var clusterGot *container.Cluster
+		if nil != data.cluster {
+			clusterGot, _ = fgc.operations.get(fakeProj, data.cluster.Location, data.cluster.Name)
+		}
+		gotBoskos := fgc.boskosOps.(*boskosFake.FakeBoskosClient).GetResources()
+		if !reflect.DeepEqual(err, data.expErr) || !reflect.DeepEqual(clusterGot, data.expCluster) || !reflect.DeepEqual(gotBoskos, data.expBoskos) {
+			t.Errorf("testing deleting cluster, with:\n\tIs Prow: '%v'\n\texisting cluster: '%v'\n\tboskos state: '%v'\nwant: boskos - '%v', cluster - '%v', err - '%v'\ngot: boskos - '%v', cluster - '%v', err - '%v'",
+				data.isProw, data.cluster, data.boskosState, data.expBoskos, data.expCluster, data.expErr, gotBoskos, clusterGot, err)
+		}
+	}
+}
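
Reviewer note: below is a minimal, hypothetical sketch (not part of this patch) of how a caller might drive the new Delete() step through the ClusterOperations interface extended above. The package name clustermanagerdemo and the helper runClusterLifecycle are illustrative assumptions; only the interface methods come from this change.

package clustermanagerdemo

import (
	"log"

	"knative.dev/pkg/testutils/clustermanager"
)

// runClusterLifecycle drives any ClusterOperations implementation (such as
// *GKECluster) through the full lifecycle, ending with the Delete() step
// added by this patch. In Prow, Delete() releases the Boskos project and
// leaves cluster teardown to the Janitor; locally it deletes the cluster
// only when NeedCleanup is set.
func runClusterLifecycle(ops clustermanager.ClusterOperations) error {
	log.Printf("Acquiring cluster from provider %q", ops.Provider())
	if err := ops.Initialize(); err != nil {
		return err
	}
	if err := ops.Acquire(); err != nil {
		return err
	}
	// ... run tests against the acquired cluster here ...

	// Always attempt cleanup; Delete() is a no-op locally unless the
	// cluster was created by this process (NeedCleanup).
	return ops.Delete()
}

Design-wise, putting Delete() on the ClusterOperations interface lets callers clean up uniformly without knowing whether the resource is a Boskos-leased project (Prow) or a locally created cluster.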