/*
Copyright 2019 The Knative Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package clustermanager

import (
	"errors"
	"fmt"
	"log"
	"strings"
	"time"

	container "google.golang.org/api/container/v1beta1"
	"knative.dev/pkg/testutils/clustermanager/boskos"
	"knative.dev/pkg/testutils/common"

	"golang.org/x/net/context"
	"golang.org/x/oauth2/google"
)

const (
	DefaultGKENumNodes = 1
	DefaultGKENodeType = "n1-standard-4"
	DefaultGKERegion   = "us-central1"
	DefaultGKEZone     = ""
	regionEnv          = "E2E_CLUSTER_REGION"
	backupRegionEnv    = "E2E_CLUSTER_BACKUP_REGIONS"
)

var (
	DefaultGKEBackupRegions = []string{"us-west1", "us-east1"}
	protectedProjects       = []string{"knative-tests"}
	protectedClusters       = []string{"knative-prow"}
	// These are arbitrary numbers determined based on past experience
	creationTimeout = 20 * time.Minute
	deletionTimeout = 10 * time.Minute
)

// GKEClient implements Client
type GKEClient struct {
}

// GKERequest contains all requests collected for cluster creation
type GKERequest struct {
	NumNodes      int64
	NodeType      string
	Region        string
	Zone          string
	BackupRegions []string
	Addons        []string
}

// GKECluster implements ClusterOperations
type GKECluster struct {
	Request *GKERequest
	// Project might be GKE specific, so put it here
	Project *string
	// NeedCleanup tells whether the cluster needs to be deleted afterwards
	// This probably should be part of task wrapper's logic
	NeedCleanup bool
	Cluster     *container.Cluster
	operations  GKESDKOperations
	boskosOps   boskos.Operation
}

// GKESDKOperations wraps GKE SDK related functions
type GKESDKOperations interface {
	create(string, string, *container.CreateClusterRequest) (*container.Operation, error)
	delete(string, string, string) (*container.Operation, error)
	get(string, string, string) (*container.Cluster, error)
	getOperation(string, string, string) (*container.Operation, error)
}

// GKESDKClient implements GKESDKOperations
type GKESDKClient struct {
	*container.Service
}

// create creates a GKE cluster in the given project and location
func (gsc *GKESDKClient) create(project, location string, rb *container.CreateClusterRequest) (*container.Operation, error) {
	parent := fmt.Sprintf("projects/%s/locations/%s", project, location)
	return gsc.Projects.Locations.Clusters.Create(parent, rb).Context(context.Background()).Do()
}

// delete deletes the GKE cluster; it returns the pending operation and does
// not wait for completion
func (gsc *GKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
	parent := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, clusterName)
	return gsc.Projects.Locations.Clusters.Delete(parent).Context(context.Background()).Do()
}

// get gets an existing GKE cluster
func (gsc *GKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
	clusterFullPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, cluster)
	return gsc.Projects.Locations.Clusters.Get(clusterFullPath).Context(context.Background()).Do()
}

// getOperation gets a long-running operation by name
func (gsc *GKESDKClient) getOperation(project, location, opName string) (*container.Operation, error) {
	name := fmt.Sprintf("projects/%s/locations/%s/operations/%s", project, location, opName)
	return gsc.Service.Projects.Locations.Operations.Get(name).Do()
}
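// Because cluster operations are defined behind the small GKESDKOperations
// interface rather than directly on *container.Service, tests can substitute
// a fake that never talks to GCP. A minimal sketch of such a fake
// (hypothetical, belongs in a _test file; not part of this package):
//
//	type fakeSDK struct{}
//
//	func (f *fakeSDK) create(project, location string, rb *container.CreateClusterRequest) (*container.Operation, error) {
//		return &container.Operation{Name: "op-create", Status: "DONE"}, nil
//	}
//	func (f *fakeSDK) delete(project, clusterName, location string) (*container.Operation, error) {
//		return &container.Operation{Name: "op-delete", Status: "DONE"}, nil
//	}
//	func (f *fakeSDK) get(project, location, cluster string) (*container.Cluster, error) {
//		return &container.Cluster{Name: cluster, Location: location}, nil
//	}
//	func (f *fakeSDK) getOperation(project, location, opName string) (*container.Operation, error) {
//		return &container.Operation{Name: opName, Status: "DONE"}, nil
//	}
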
// Setup sets up a GKECluster client.
// numNodes: default to 1 if not provided
// nodeType: default to n1-standard-4 if not provided
// region: default to us-central1 if not provided, and the default backup
// regions are used
// zone: default is none, must be provided together with region
// project: no default
// addons: cluster addons to be added to cluster
func (gs *GKEClient) Setup(numNodes *int64, nodeType *string, region *string, zone *string, project *string, addons []string) ClusterOperations {
	gc := &GKECluster{
		Request: &GKERequest{
			NumNodes:      DefaultGKENumNodes,
			NodeType:      DefaultGKENodeType,
			Region:        DefaultGKERegion,
			Zone:          DefaultGKEZone,
			BackupRegions: DefaultGKEBackupRegions,
			Addons:        addons,
		},
	}

	if nil != project { // use provided project and create cluster
		gc.Project = project
		gc.NeedCleanup = true
	}

	if nil != numNodes {
		gc.Request.NumNodes = *numNodes
	}
	if nil != nodeType {
		gc.Request.NodeType = *nodeType
	}
	if nil != region {
		gc.Request.Region = *region
	}
	if "" != common.GetOSEnv(regionEnv) {
		gc.Request.Region = common.GetOSEnv(regionEnv)
	}
	if "" != common.GetOSEnv(backupRegionEnv) {
		gc.Request.BackupRegions = strings.Split(common.GetOSEnv(backupRegionEnv), " ")
	}
	if nil != zone {
		gc.Request.Zone = *zone
		gc.Request.BackupRegions = make([]string, 0)
	}

	ctx := context.Background()
	c, err := google.DefaultClient(ctx, container.CloudPlatformScope)
	if nil != err {
		log.Fatalf("failed creating google client: '%v'", err)
	}

	containerService, err := container.New(c)
	if nil != err {
		log.Fatalf("failed creating container service: '%v'", err)
	}
	gc.operations = &GKESDKClient{containerService}

	gc.boskosOps = &boskos.Client{}

	return gc
}

// Initialize sets up the GKE SDK client and checks the environment for an
// existing cluster and project, to decide whether to use them or create new
// ones.
func (gc *GKECluster) Initialize() error {
	// Try to obtain the project name via `kubectl` and `gcloud`
	if nil == gc.Project {
		if err := gc.checkEnvironment(); nil != err {
			return fmt.Errorf("failed checking existing cluster: '%v'", err)
		} else if nil != gc.Cluster { // return if Cluster was already set by kubeconfig
			return nil
		}
	}
	// Get the project name from Boskos if running in Prow
	if nil == gc.Project && common.IsProw() {
		project, err := gc.boskosOps.AcquireGKEProject(nil)
		if nil != err {
			return fmt.Errorf("failed acquiring boskos project: '%v'", err)
		}
		gc.Project = &project.Name
	}
	if nil == gc.Project || "" == *gc.Project {
		return errors.New("gcp project must be set")
	}
	if !common.IsProw() && nil == gc.Cluster {
		gc.NeedCleanup = true
	}
	log.Printf("Using project %q for running test", *gc.Project)
	return nil
}

// Provider returns gke
func (gc *GKECluster) Provider() string {
	return "gke"
}
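// A minimal end-to-end usage sketch, assuming default application credentials
// are configured and that the ClusterOperations interface exposes Initialize,
// Acquire, and Delete as implemented below (hypothetical caller code, not part
// of this package):
//
//	numNodes := int64(1)
//	nodeType := "n1-standard-8"
//	region := "us-west1"
//	ops := (&GKEClient{}).Setup(&numNodes, &nodeType, &region, nil, nil, []string{"istio"})
//	if err := ops.Initialize(); err != nil {
//		log.Fatalf("failed initializing: '%v'", err)
//	}
//	if err := ops.Acquire(); err != nil {
//		log.Fatalf("failed acquiring cluster: '%v'", err)
//	}
//	defer ops.Delete()
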
// Acquire gets an existing cluster or creates a new one. The creation logic
// retries in BackupRegions on failure. By default clusters are created in
// us-central1, with us-west1 and us-east1 as backup regions. If Region or
// Zone is provided explicitly, there are no retries.
func (gc *GKECluster) Acquire() error {
	gc.ensureProtected()
	var err error
	// Check if using an existing cluster
	if nil != gc.Cluster {
		return nil
	}
	// Perform GKE specific cluster creation logic
	clusterName, err := getResourceName(ClusterResource)
	if nil != err {
		return fmt.Errorf("failed getting cluster name: '%v'", err)
	}

	regions := []string{gc.Request.Region}
	for _, br := range gc.Request.BackupRegions {
		exist := false
		for _, region := range regions {
			if br == region {
				exist = true
			}
		}
		if !exist {
			regions = append(regions, br)
		}
	}
	var cluster *container.Cluster
	var op *container.Operation
	for i, region := range regions {
		// Restore innocence: reset err for this attempt
		err = nil
		rb := &container.CreateClusterRequest{
			Cluster: &container.Cluster{
				Name: clusterName,
				// Installing addons after cluster creation takes at least 5
				// minutes, so install addons as part of cluster creation, which
				// doesn't seem to add much time on top of cluster creation
				AddonsConfig:     gc.getAddonsConfig(),
				InitialNodeCount: gc.Request.NumNodes,
				NodeConfig: &container.NodeConfig{
					MachineType: gc.Request.NodeType,
				},
			},
			ProjectId: *gc.Project,
		}

		clusterLoc := getClusterLocation(region, gc.Request.Zone)

		// Delete the cluster if it already exists
		existingCluster, _ := gc.operations.get(*gc.Project, clusterLoc, clusterName)
		if nil != existingCluster {
			log.Printf("Cluster %q already exists in %q. Deleting...", clusterName, clusterLoc)
			op, err = gc.operations.delete(*gc.Project, clusterName, clusterLoc)
			if nil == err {
				err = gc.wait(clusterLoc, op.Name, deletionTimeout)
			}
		}
		// Create the cluster only if the previous step succeeded
		if nil == err {
			log.Printf("Creating cluster %q in %q", clusterName, clusterLoc)
			op, err = gc.operations.create(*gc.Project, clusterLoc, rb)
			if nil == err {
				if err = gc.wait(clusterLoc, op.Name, creationTimeout); nil == err {
					cluster, err = gc.operations.get(*gc.Project, clusterLoc, rb.Cluster.Name)
				}
			}
		}
		if nil != err {
			errMsg := fmt.Sprintf("Error during cluster creation: '%v'. ", err)
			if gc.NeedCleanup { // Delete the half-created cluster if it was created by this process
				errMsg = fmt.Sprintf("%sDeleting cluster %q in %q in background...\n", errMsg, clusterName, clusterLoc)
				go gc.operations.delete(*gc.Project, clusterName, clusterLoc)
			}
			// Retry another region if cluster creation failed.
			// TODO(chaodaiG): catch specific errors as we know what the errors
			// look like for stockout etc.
			if len(regions) != i+1 {
				errMsg = fmt.Sprintf("%sRetry another region %q for cluster creation", errMsg, regions[i+1])
			}
			log.Print(errMsg)
		} else {
			log.Print("Cluster creation completed")
			gc.Cluster = cluster
			break
		}
	}

	return err
}
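// As a concrete example of the retry sequence above: with the defaults, the
// locations tried are ["us-central1", "us-west1", "us-east1"]; with
// E2E_CLUSTER_REGION=us-west1 they are ["us-west1", "us-east1"], since a
// backup region equal to the primary region is dropped by the deduplication
// loop; and with a Zone set, BackupRegions was already emptied in Setup, so
// exactly one location is tried.
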
// Delete takes care of GKE cluster resource cleanup. It only releases the
// Boskos resource if running in Prow, otherwise it deletes the cluster if
// NeedCleanup is set.
func (gc *GKECluster) Delete() error {
	gc.ensureProtected()
	// Release the Boskos resource if running in Prow, letting the Janitor take
	// care of cluster deletion
	if common.IsProw() {
		log.Printf("Releasing Boskos resource: '%v'", *gc.Project)
		return gc.boskosOps.ReleaseGKEProject(nil, *gc.Project)
	}

	// NeedCleanup is only true if running locally and the cluster was created
	// by this process
	if !gc.NeedCleanup {
		return nil
	}
	// Should only get here if running locally and the cluster was created by
	// this client, so at this point the cluster should have been set
	if nil == gc.Cluster {
		return fmt.Errorf("cluster doesn't exist")
	}

	log.Printf("Deleting cluster %q in %q", gc.Cluster.Name, gc.Cluster.Location)
	op, err := gc.operations.delete(*gc.Project, gc.Cluster.Name, gc.Cluster.Location)
	if nil == err {
		err = gc.wait(gc.Cluster.Location, op.Name, deletionTimeout)
	}
	if nil != err {
		return fmt.Errorf("failed deleting cluster: '%v'", err)
	}
	return nil
}

// getAddonsConfig gets AddonsConfig from Request, and contains the logic for
// converting string arguments into a typed AddonsConfig, for example
// `IstioConfig`. Currently it supports istio.
func (gc *GKECluster) getAddonsConfig() *container.AddonsConfig {
	const (
		// Define all supported addons here
		istio = "istio"
	)
	ac := &container.AddonsConfig{}
	for _, name := range gc.Request.Addons {
		switch strings.ToLower(name) {
		case istio:
			ac.IstioConfig = &container.IstioConfig{Disabled: false}
		default:
			panic(fmt.Sprintf("addon type %q not supported. Has to be one of: %q", name, istio))
		}
	}

	return ac
}

// wait polls the operation identified by the unique opName (operation ID
// created by the cloud) until it's done, or until the given timeout elapses
func (gc *GKECluster) wait(location, opName string, wait time.Duration) error {
	const (
		pendingStatus = "PENDING"
		runningStatus = "RUNNING"
		doneStatus    = "DONE"
	)
	var op *container.Operation
	var err error

	timeout := time.After(wait)
	tick := time.Tick(500 * time.Millisecond)
	for {
		select {
		// Got a timeout! fail with a timeout error
		case <-timeout:
			return errors.New("timed out waiting")
		case <-tick:
			// Retry 3 times in case of weird network errors, or rate limiting
			for r, w := 0, 50*time.Microsecond; r < 3; r, w = r+1, w*2 {
				op, err = gc.operations.getOperation(*gc.Project, location, opName)
				if nil == err {
					if op.Status == doneStatus {
						return nil
					} else if op.Status == pendingStatus || op.Status == runningStatus {
						// Valid operation, no need to retry
						break
					} else {
						// Intermittent error states have been seen to fix
						// themselves, so retry to avoid too much flakiness
						err = fmt.Errorf("unexpected operation status: %q", op.Status)
					}
				}
				time.Sleep(w)
			}
			// If err still persists after retries, exit
			if nil != err {
				return err
			}
		}
	}
}

// ensureProtected ensures we are not operating on a protected project/cluster
func (gc *GKECluster) ensureProtected() {
	if nil != gc.Project {
		for _, pp := range protectedProjects {
			if *gc.Project == pp {
				log.Fatalf("project %q is protected", *gc.Project)
			}
		}
	}
	if nil != gc.Cluster {
		for _, pc := range protectedClusters {
			if gc.Cluster.Name == pc {
				log.Fatalf("cluster %q is protected", gc.Cluster.Name)
			}
		}
	}
}
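// A worked example of the retry pacing in wait above: the per-attempt sleep
// starts at 50µs and doubles each retry, so a poll that keeps failing sleeps
// 50µs, 100µs, then 200µs before surfacing the error, while the outer tick
// re-polls every 500ms until the overall timeout (creationTimeout or
// deletionTimeout) elapses.
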
// checkEnvironment checks for an existing cluster by looking at kubeconfig,
// and sets up gc.Project and gc.Cluster properly, otherwise failing.
// If the project can be derived from gcloud, it sets that up as well.
func (gc *GKECluster) checkEnvironment() error {
	var err error
	// If kubeconfig is configured, use it
	output, err := common.StandardExec("kubectl", "config", "current-context")
	if nil == err {
		currentContext := strings.TrimSpace(string(output))
		if strings.HasPrefix(currentContext, "gke_") {
			// output should be in the form of gke_PROJECT_REGION_CLUSTER
			parts := strings.Split(currentContext, "_")
			if len(parts) != 4 { // fall through with a warning
				log.Printf("WARNING: ignoring kubectl current-context since it's malformed: '%s'", currentContext)
			} else {
				log.Printf("kubeconfig isn't empty, using this cluster for running tests: %s", currentContext)
				gc.Project = &parts[1]
				gc.Cluster, err = gc.operations.get(*gc.Project, parts[2], parts[3])
				if nil != err {
					return fmt.Errorf("couldn't find cluster %s in %s in %s, does it exist? %v", parts[3], parts[1], parts[2], err)
				}
				return nil
			}
		}
	}
	if nil != err && len(output) > 0 {
		// this is an unexpected error, surface it directly
		return fmt.Errorf("failed running kubectl config current-context: '%s'", string(output))
	}

	// If gcloud is pointing to a project, use it
	output, err = common.StandardExec("gcloud", "config", "get-value", "project")
	if nil != err {
		return fmt.Errorf("failed getting gcloud project: '%v'", err)
	}
	if string(output) != "" {
		project := string(output)
		gc.Project = &project
	}

	return nil
}
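// As an example of the context parsing in checkEnvironment above (hypothetical
// project and cluster names): GKE project IDs, locations, and cluster names
// cannot contain underscores, so splitting on "_" is unambiguous:
//
//	currentContext := "gke_my-gcp-project_us-central1_my-test-cluster"
//	parts := strings.Split(currentContext, "_")
//	// parts[1] == "my-gcp-project" (project)
//	// parts[2] == "us-central1"    (location)
//	// parts[3] == "my-test-cluster" (cluster)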