Add cluster deletion function (#674)

* Add cluster deletion function

* Apply suggestions from code review

Co-Authored-By: Adriano Cunha <35786489+adrcunha@users.noreply.github.com>
chaodaiG 2019-09-13 11:05:35 -07:00 committed by Knative Prow Robot
parent 1bea34570b
commit 55634011c4
4 changed files with 269 additions and 25 deletions


@@ -23,16 +23,32 @@ import (
"knative.dev/pkg/testutils/clustermanager/boskos"
)
const (
fakeOwner = "fake-owner"
)
// FakeBoskosClient implements boskos.Operation
type FakeBoskosClient struct {
resources []*boskoscommon.Resource
}
func (c *FakeBoskosClient) getOwner(host *string) string {
if nil == host {
return fakeOwner
}
return *host
}
func (c *FakeBoskosClient) GetResources() []*boskoscommon.Resource {
return c.resources
}
// AcquireGKEProject fakes acquiring a GKE project from Boskos
func (c *FakeBoskosClient) AcquireGKEProject(host *string) (*boskoscommon.Resource, error) {
for _, res := range c.resources {
if res.State == boskoscommon.Free {
res.State = boskoscommon.Busy
res.Owner = c.getOwner(host)
return res, nil
}
}
@@ -41,18 +57,15 @@ func (c *FakeBoskosClient) AcquireGKEProject(host *string) (*boskoscommon.Resour
// ReleaseGKEProject fakes releasing a GKE project back to Boskos
func (c *FakeBoskosClient) ReleaseGKEProject(host *string, name string) error {
owner := c.getOwner(host)
for _, res := range c.resources {
if res.Name == name {
if res.Owner == owner {
res.Owner = ""
res.State = boskoscommon.Free
return nil
} else {
return fmt.Errorf("Got owner: '%s', expect owner: '%s'", res.Owner, *host)
return fmt.Errorf("Got owner: '%s', expect owner: '%s'", res.Owner, owner)
}
}
}
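For orientation, a minimal sketch of driving this fake from a test; the test name is hypothetical, while NewGKEProject and the import alias are taken from the cluster manager tests further below:

import (
	"testing"

	boskosFake "knative.dev/pkg/testutils/clustermanager/boskos/fake"
)

func TestFakeBoskosRoundTrip(t *testing.T) {
	c := &boskosFake.FakeBoskosClient{}
	// Seed one Free resource.
	c.NewGKEProject("fake-proj")
	// Acquiring with a nil host assigns the default fakeOwner.
	res, err := c.AcquireGKEProject(nil)
	if err != nil || res.Name != "fake-proj" {
		t.Fatalf("acquire: res = %v, err = %v", res, err)
	}
	// Releasing with the matching owner frees the resource again.
	owner := res.Owner
	if err := c.ReleaseGKEProject(&owner, "fake-proj"); err != nil {
		t.Fatalf("release: %v", err)
	}
}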


@@ -26,4 +26,5 @@ type ClusterOperations interface {
Provider() string
Initialize() error
Acquire() error
Delete() error
}
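Since GKECluster (below) must keep satisfying this interface now that Delete is part of it, a compile-time assertion is a cheap guard; a one-line sketch, assuming the interface and GKECluster share a package as the diff suggests and that no such assertion already exists:

var _ ClusterOperations = (*GKECluster)(nil)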


@@ -46,6 +46,7 @@ var (
protectedClusters = []string{"knative-prow"}
// These are arbitrary numbers determined based on past experience
creationTimeout = 20 * time.Minute
deletionTimeout = 10 * time.Minute
)
// GKEClient implements Client
@@ -77,6 +78,7 @@ type GKECluster struct {
// GKESDKOperations wraps GKE SDK related functions
type GKESDKOperations interface {
create(string, string, *container.CreateClusterRequest) (*container.Operation, error)
delete(string, string, string) (*container.Operation, error)
get(string, string, string) (*container.Cluster, error)
getOperation(string, string, string) (*container.Operation, error)
}
@@ -91,6 +93,12 @@ func (gsc *GKESDKClient) create(project, location string, rb *container.CreateCl
return gsc.Projects.Locations.Clusters.Create(parent, rb).Context(context.Background()).Do()
}
// delete deletes a GKE cluster and returns the operation for the caller to wait on
func (gsc *GKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
parent := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, clusterName)
return gsc.Projects.Locations.Clusters.Delete(parent).Context(context.Background()).Do()
}
func (gsc *GKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
clusterFullPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, cluster)
return gsc.Projects.Locations.Clusters.Get(clusterFullPath).Context(context.Background()).Do()
@@ -222,7 +230,10 @@ func (gc *GKECluster) Acquire() error {
}
}
var cluster *container.Cluster
var op *container.Operation
for i, region := range regions {
// Reset err at the start of each attempt
err = nil
rb := &container.CreateClusterRequest{
Cluster: &container.Cluster{
Name: clusterName,
@@ -235,24 +246,36 @@
}
clusterLoc := getClusterLocation(region, gc.Request.Zone)
// Deleting cluster if it already exists
existingCluster, _ := gc.operations.get(*gc.Project, clusterLoc, clusterName)
if nil != existingCluster {
log.Printf("Cluster %q already exists in %q. Deleting...", clusterName, clusterLoc)
op, err = gc.operations.delete(*gc.Project, clusterName, clusterLoc)
if nil == err {
err = gc.wait(clusterLoc, op.Name, deletionTimeout)
}
}
// Creating cluster only if previous step succeeded
if nil == err {
log.Printf("Creating cluster %q in %q", clusterName, clusterLoc)
op, err = gc.operations.create(*gc.Project, clusterLoc, rb)
if nil == err {
if err = gc.wait(clusterLoc, op.Name, creationTimeout); nil == err {
cluster, err = gc.operations.get(*gc.Project, clusterLoc, rb.Cluster.Name)
}
}
}
if nil != err {
errMsg := fmt.Sprintf("error creating cluster: '%v'", err)
errMsg := fmt.Sprintf("Error during cluster creation: '%v'. ", err)
if gc.NeedCleanup { // Delete half created cluster if it's user created
errMsg = fmt.Sprintf("%sDeleting cluster %q in %q in background...\n", errMsg, clusterName, clusterLoc)
go gc.operations.delete(*gc.Project, clusterName, clusterLoc)
}
// Retry another region if cluster creation failed.
// TODO(chaodaiG): catch specific errors once we know what they look like, e.g. for stockout
if len(regions) != i+1 {
errMsg = fmt.Sprintf("%s. Retry another region '%s' for cluster creation", errMsg, regions[i+1])
errMsg = fmt.Sprintf("%sRetry another region %q for cluster creation", errMsg, regions[i+1])
}
log.Printf(errMsg)
} else {
@@ -265,6 +288,39 @@ func (gc *GKECluster) Acquire() error {
return err
}
// Delete takes care of GKE cluster resource cleanup. It only releases the Boskos
// resource when running in Prow; otherwise it deletes the cluster if NeedCleanup is set
func (gc *GKECluster) Delete() error {
gc.ensureProtected()
// Release the Boskos project if running in Prow, and let the Boskos Janitor
// take care of deleting the cluster
if common.IsProw() {
log.Printf("Releasing Boskos resource: '%v'", *gc.Project)
return gc.boskosOps.ReleaseGKEProject(nil, *gc.Project)
}
// NeedCleanup is true only when running locally and the cluster was created
// by this process
if !gc.NeedCleanup {
return nil
}
// We should only get here when running locally and the cluster was created by
// this client, so gc.Cluster must already be set
if nil == gc.Cluster {
return fmt.Errorf("cluster doesn't exist")
}
log.Printf("Deleting cluster %q in %q", gc.Cluster.Name, gc.Cluster.Location)
op, err := gc.operations.delete(*gc.Project, gc.Cluster.Name, gc.Cluster.Location)
if nil == err {
err = gc.wait(gc.Cluster.Location, op.Name, deletionTimeout)
}
if nil != err {
return fmt.Errorf("failed deleting cluster: '%v'", err)
}
return nil
}
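Taken together with Acquire, the intended lifecycle for a local caller looks roughly like the sketch below; the wiring of Project, Request, and the operation clients is assumed to happen during client setup, which this diff does not show:

var gc *GKECluster // assumed to be initialized by the client's setup path
if err := gc.Acquire(); err != nil {
	log.Fatalf("failed acquiring cluster: %v", err)
}
defer func() {
	// Releases the Boskos project in Prow; deletes the cluster locally
	// when NeedCleanup is set.
	if err := gc.Delete(); err != nil {
		log.Printf("cleanup failed: %v", err)
	}
}()
// ... run tests against gc.Cluster ...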
// wait relies on the unique opName (the operation ID assigned by the cloud), and
// blocks until the operation is done or the timeout elapses
func (gc *GKECluster) wait(location, opName string, wait time.Duration) error {
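The body of wait is untouched by this commit and elided here by the diff. Judging from the operation statuses and error strings exercised in the tests below, it conceptually polls getOperation until the operation finishes; the following is a rough reconstruction under those assumptions, not the verbatim implementation, and the poll interval in particular is made up:

	timeout := time.After(wait)
	for {
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting")
		default:
			op, err := gc.operations.getOperation(*gc.Project, location, opName)
			if err != nil {
				return err
			}
			switch op.Status {
			case "DONE":
				return nil
			case "PENDING":
				time.Sleep(50 * time.Millisecond) // assumed poll interval
			default:
				return fmt.Errorf("unexpected operation status: %q", op.Status)
			}
		}
	}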


@@ -17,16 +17,19 @@ limitations under the License.
package clustermanager
import (
"errors"
"fmt"
"io/ioutil"
"os"
"reflect"
"strconv"
"strings"
"testing"
"time"
"google.golang.org/api/container/v1"
boskoscommon "k8s.io/test-infra/boskos/common"
boskosFake "knative.dev/pkg/testutils/clustermanager/boskos/fake"
"knative.dev/pkg/testutils/common"
)
@@ -67,7 +70,7 @@ func newFakeGKESDKClient() *FakeGKESDKClient {
// fgsc.opStatus by fgsc.opStatus[strconv.Itoa(fgsc.opNumber+1)]="PENDING" to make the
// next operation pending
func (fgsc *FakeGKESDKClient) newOp() *container.Operation {
opName := strconv.Itoa(fgsc.opNumber)
op := &container.Operation{
Name: opName,
Status: "DONE",
@@ -87,7 +90,7 @@ func (fgsc *FakeGKESDKClient) create(project, location string, rb *container.Cre
if cls, ok := fgsc.clusters[parent]; ok {
for _, cl := range cls {
if cl.Name == name {
return nil, fmt.Errorf("cluster already exist")
return nil, errors.New("cluster already exist")
}
}
} else {
@@ -103,6 +106,24 @@ func (fgsc *FakeGKESDKClient) create(project, location string, rb *container.Cre
return fgsc.newOp(), nil
}
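A side note on the opName fix in newOp above: converting an integer directly to a Go string yields the UTF-8 encoding of that code point rather than its decimal form, which is what strconv.Itoa provides; a tiny standalone illustration:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	n := 65
	fmt.Printf("%q\n", string(rune(n))) // "A", the rune with code point 65
	fmt.Printf("%q\n", strconv.Itoa(n)) // "65", the decimal representation
}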
func (fgsc *FakeGKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
parent := fmt.Sprintf("projects/%s/locations/%s", project, location)
found := -1
if clusters, ok := fgsc.clusters[parent]; ok {
for i, cluster := range clusters {
if cluster.Name == clusterName {
found = i
}
}
}
if found == -1 {
return nil, fmt.Errorf("cluster %q not found for deletion", clusterName)
}
// Delete this cluster
fgsc.clusters[parent] = append(fgsc.clusters[parent][:found], fgsc.clusters[parent][found+1:]...)
return fgsc.newOp(), nil
}
func (fgsc *FakeGKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
parent := fmt.Sprintf("projects/%s/locations/%s", project, location)
if cls, ok := fgsc.clusters[parent]; ok {
@@ -478,6 +499,7 @@ func TestAcquire(t *testing.T) {
fakeBuildID := "1234"
datas := []struct {
existCluster *container.Cluster
kubeconfigSet bool
nextOpStatus []string
expClusterName string
expClusterLocation string
@@ -488,20 +510,33 @@
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
}, []string{}, "customcluster", "us-central1", nil,
}, true, []string{}, "customcluster", "us-central1", nil,
}, {
// cluster exists but is not set in kubeconfig; it will be deleted, then
// recreated
&container.Cluster{
Name: fakeClusterName,
Location: "us-central1",
}, false, []string{}, fakeClusterName, "us-central1", nil,
}, {
// cluster exists but is not set in kubeconfig; deletion fails, so it is
// recreated in us-west1
&container.Cluster{
Name: fakeClusterName,
Location: "us-central1",
}, false, []string{"BAD"}, fakeClusterName, "us-west1", nil,
}, {
// cluster creation succeeded
nil, false, []string{}, fakeClusterName, "us-central1", nil,
}, {
// cluster creation succeeded after a retry
nil, false, []string{"PENDING"}, fakeClusterName, "us-west1", nil,
}, {
// cluster creation failed for all retries
nil, false, []string{"PENDING", "PENDING", "PENDING"}, "", "", fmt.Errorf("timed out waiting"),
}, {
// cluster creation ended in a bad state
nil, false, []string{"BAD", "BAD", "BAD"}, "", "", fmt.Errorf("unexpected operation status: %q", "BAD"),
},
}
@@ -528,12 +563,22 @@
return oldFunc(key)
}
fgc := setupFakeGKECluster()
opCount := 0
if nil != data.existCluster {
opCount++
fgc.operations.create(fakeProj, data.existCluster.Location, &container.CreateClusterRequest{
Cluster: &container.Cluster{
Name: data.existCluster.Name,
},
ProjectId: fakeProj,
})
if data.kubeconfigSet {
fgc.Cluster = data.existCluster
}
}
fgc.Project = &fakeProj
for i, status := range data.nextOpStatus {
fgc.operations.(*FakeGKESDKClient).opStatus[strconv.Itoa(opCount+i)] = status
}
fgc.Request = &GKERequest{
@@ -543,6 +588,9 @@
Zone: "",
BackupRegions: DefaultGKEBackupRegions,
}
// Set NeedCleanup to false for easier testing, as cleanup would otherwise
// launch a background goroutine
fgc.NeedCleanup = false
err := fgc.Acquire()
var gotName, gotLocation string
if nil != fgc.Cluster {
@@ -555,3 +603,129 @@
}
}
}
func TestDelete(t *testing.T) {
datas := []struct {
isProw bool
needCleanup bool
boskosState []*boskoscommon.Resource
cluster *container.Cluster
expBoskos []*boskoscommon.Resource
expCluster *container.Cluster
expErr error
}{
{
// Not in prow, NeedCleanup is false
false,
false,
[]*boskoscommon.Resource{},
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
},
nil,
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
Status: "RUNNING",
},
nil,
}, {
// Not in prow, NeedCleanup is true
false,
true,
[]*boskoscommon.Resource{},
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
},
nil,
nil,
nil,
}, {
// Not in prow, NeedCleanup is true, but cluster doesn't exist
false,
true,
[]*boskoscommon.Resource{},
nil,
nil,
nil,
fmt.Errorf("cluster doesn't exist"),
}, {
// In prow, only need to release boskos
true,
true,
[]*boskoscommon.Resource{&boskoscommon.Resource{
Name: fakeProj,
}},
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
},
[]*boskoscommon.Resource{&boskoscommon.Resource{
Type: "gke-project",
Name: fakeProj,
State: boskoscommon.Free,
}},
&container.Cluster{
Name: "customcluster",
Location: "us-central1",
Status: "RUNNING",
},
nil,
},
}
// mock GetOSEnv for testing
oldFunc := common.GetOSEnv
// mock timeout so it doesn't run forever
oldTimeout := creationTimeout
creationTimeout = 100 * time.Millisecond
defer func() {
// restore
common.GetOSEnv = oldFunc
creationTimeout = oldTimeout
}()
for _, data := range datas {
common.GetOSEnv = func(key string) string {
switch key {
case "PROW_JOB_ID": // needed to mock IsProw()
if data.isProw {
return "fake_job_id"
}
return ""
}
return oldFunc(key)
}
fgc := setupFakeGKECluster()
fgc.Project = &fakeProj
fgc.NeedCleanup = data.needCleanup
if nil != data.cluster {
fgc.operations.create(fakeProj, data.cluster.Location, &container.CreateClusterRequest{
Cluster: &container.Cluster{
Name: data.cluster.Name,
},
ProjectId: fakeProj,
})
fgc.Cluster = data.cluster
}
// Set up fake boskos
for _, bos := range data.boskosState {
fgc.boskosOps.(*boskosFake.FakeBoskosClient).NewGKEProject(bos.Name)
// Acquire with default user
fgc.boskosOps.(*boskosFake.FakeBoskosClient).AcquireGKEProject(nil)
}
err := fgc.Delete()
var clusterGot *container.Cluster
if nil != data.cluster {
clusterGot, _ = fgc.operations.get(fakeProj, data.cluster.Location, data.cluster.Name)
}
gotBoskos := fgc.boskosOps.(*boskosFake.FakeBoskosClient).GetResources()
if !reflect.DeepEqual(err, data.expErr) || !reflect.DeepEqual(clusterGot, data.expCluster) || !reflect.DeepEqual(gotBoskos, data.expBoskos) {
t.Errorf("testing deleting cluster, with:\n\tIs Prow: '%v'\n\texisting cluster: '%v'\n\tboskos state: '%v'\nwant: boskos - '%v', cluster - '%v', err - '%v'\ngot: boskos - '%v', cluster - '%v', err - '%v'",
data.isProw, data.cluster, data.boskosState, data.expBoskos, data.expCluster, data.expErr, nil, clusterGot, err)
}
}
}