pkg/testutils/clustermanager/gke.go

460 lines
14 KiB
Go

/*
Copyright 2019 The Knative Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package clustermanager
import (
"errors"
"fmt"
"log"
"strings"
"time"
container "google.golang.org/api/container/v1beta1"
"knative.dev/pkg/testutils/clustermanager/boskos"
"knative.dev/pkg/testutils/common"
"golang.org/x/net/context"
"golang.org/x/oauth2/google"
)
const (
// DefaultGKENumNodes is the node count Setup requests when numNodes is nil.
DefaultGKENumNodes = 1
// DefaultGKENodeType is the machine type Setup requests when nodeType is nil.
DefaultGKENodeType = "n1-standard-4"
// DefaultGKERegion is the region Setup starts from before applying the
// region argument and the E2E_CLUSTER_REGION environment variable.
DefaultGKERegion = "us-central1"
// DefaultGKEZone is the zone Setup uses when zone is nil.
DefaultGKEZone = ""
// regionEnv names the environment variable that, when non-empty, overrides
// the region chosen by Setup.
regionEnv = "E2E_CLUSTER_REGION"
// backupRegionEnv names the environment variable that, when non-empty,
// supplies a space-separated list of backup regions to Setup.
backupRegionEnv = "E2E_CLUSTER_BACKUP_REGIONS"
)
var (
// DefaultGKEBackupRegions lists the regions Setup configures as fallbacks;
// Acquire tries them in order after the primary region fails.
DefaultGKEBackupRegions = []string{"us-west1", "us-east1"}
// protectedProjects are project names ensureProtected refuses to operate on.
protectedProjects = []string{"knative-tests"}
// protectedClusters are cluster names ensureProtected refuses to operate on.
protectedClusters = []string{"knative-prow"}
// These are arbitrary numbers determined based on past experience
// creationTimeout bounds how long Acquire waits for a create operation.
creationTimeout = 20 * time.Minute
// deletionTimeout bounds how long Acquire and Delete wait for a delete
// operation.
deletionTimeout = 10 * time.Minute
)
// GKEClient implements Client. It holds no state of its own; its Setup method
// builds and returns a configured GKECluster.
type GKEClient struct {
}
// GKERequest contains all requests collected for cluster creation
type GKERequest struct {
// NumNodes is sent as the cluster's initial node count.
NumNodes int64
// NodeType is sent as the machine type of the cluster's nodes.
NodeType string
// Region is the first region Acquire attempts cluster creation in.
Region string
// Zone is combined with the region to form the cluster location.
Zone string
// BackupRegions are tried in order by Acquire after Region fails;
// duplicates of already-listed regions are skipped.
BackupRegions []string
// Addons are addon names translated by getAddonsConfig; only "istio"
// (case-insensitive) is currently accepted.
Addons []string
}
// GKECluster implements ClusterOperations
type GKECluster struct {
// Request holds the parameters used when a new cluster has to be created.
Request *GKERequest
// Project might be GKE specific, so put it here
// It stays nil until supplied to Setup or discovered by Initialize.
Project *string
// NeedCleanup tells whether the cluster needs to be deleted afterwards
// This probably should be part of task wrapper's logic
NeedCleanup bool
// Cluster is the cluster in use: either an existing one found through
// kubeconfig or the one created by Acquire. Nil until one is known.
Cluster *container.Cluster
// operations performs the GKE API calls; an interface so it can be swapped.
operations GKESDKOperations
// boskosOps acquires and releases Boskos projects when running in Prow.
boskosOps boskos.Operation
}
// GKESDKOperations wraps GKE SDK related functions
// Arguments are positional strings; the orders below are those used by
// GKESDKClient. Note that delete takes the cluster name before the location,
// unlike get.
type GKESDKOperations interface {
// create(project, location, request) starts a cluster creation.
create(string, string, *container.CreateClusterRequest) (*container.Operation, error)
// delete(project, clusterName, location) starts a cluster deletion.
delete(string, string, string) (*container.Operation, error)
// get(project, location, cluster) fetches a cluster.
get(string, string, string) (*container.Cluster, error)
// getOperation(project, location, opName) fetches an operation's state.
getOperation(string, string, string) (*container.Operation, error)
}
// GKESDKClient implements GKESDKOperations on top of an embedded
// container.Service, whose Projects.Locations API it calls directly.
type GKESDKClient struct {
*container.Service
}
// create submits the cluster creation request rb under the given project and
// location and returns the operation reported by the API.
func (gsc *GKESDKClient) create(project, location string, rb *container.CreateClusterRequest) (*container.Operation, error) {
	call := gsc.Projects.Locations.Clusters.Create(
		fmt.Sprintf("projects/%s/locations/%s", project, location),
		rb,
	)
	return call.Context(context.Background()).Do()
}
// delete submits a deletion request for the named GKE cluster and returns the
// operation reported by the API. It does not wait for the deletion to finish;
// callers that need completion must poll the returned operation.
// Note the argument order: cluster name comes before location.
func (gsc *GKESDKClient) delete(project, clusterName, location string) (*container.Operation, error) {
	clusterFullPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, clusterName)
	return gsc.Projects.Locations.Clusters.Delete(clusterFullPath).Context(context.Background()).Do()
}
// get fetches the cluster identified by project, location and cluster name.
func (gsc *GKESDKClient) get(project, location, cluster string) (*container.Cluster, error) {
	call := gsc.Projects.Locations.Clusters.Get(
		fmt.Sprintf("projects/%s/locations/%s/clusters/%s", project, location, cluster),
	)
	return call.Context(context.Background()).Do()
}
// getOperation fetches the current state of the operation opName in the given
// project and location. Like the other SDK calls on GKESDKClient it attaches
// a background context to the request.
func (gsc *GKESDKClient) getOperation(project, location, opName string) (*container.Operation, error) {
	name := fmt.Sprintf("projects/%s/locations/%s/operations/%s", project, location, opName)
	return gsc.Projects.Locations.Operations.Get(name).Context(context.Background()).Do()
}
// Setup sets up a GKECluster client. Nil pointer arguments fall back to the
// package defaults.
// numNodes: default to DefaultGKENumNodes (1) if not provided
// nodeType: default to n1-standard-4 if not provided
// region: default to us-central1 if not provided, and use default backup
// regions; a non-empty E2E_CLUSTER_REGION environment variable takes
// precedence over both the argument and the default
// zone: default is none, must be provided together with region; providing a
// zone clears the backup regions
// project: no default; when provided the cluster is marked as needing cleanup
// addons: cluster addons to be added to cluster
//
// A non-empty E2E_CLUSTER_BACKUP_REGIONS environment variable replaces the
// default backup regions; it is read as a whitespace-separated list.
//
// Setup terminates the process through log.Fatalf when the Google client or
// the container service cannot be created.
func (gs *GKEClient) Setup(numNodes *int64, nodeType *string, region *string, zone *string, project *string, addons []string) ClusterOperations {
	gc := &GKECluster{
		Request: &GKERequest{
			NumNodes:      DefaultGKENumNodes,
			NodeType:      DefaultGKENodeType,
			Region:        DefaultGKERegion,
			Zone:          DefaultGKEZone,
			BackupRegions: DefaultGKEBackupRegions,
			Addons:        addons,
		},
	}

	if project != nil { // use provided project and create cluster
		gc.Project = project
		gc.NeedCleanup = true
	}
	if numNodes != nil {
		gc.Request.NumNodes = *numNodes
	}
	if nodeType != nil {
		gc.Request.NodeType = *nodeType
	}
	if region != nil {
		gc.Request.Region = *region
	}
	// Environment variables win over arguments and defaults.
	if envRegion := common.GetOSEnv(regionEnv); envRegion != "" {
		gc.Request.Region = envRegion
	}
	if envBackups := common.GetOSEnv(backupRegionEnv); envBackups != "" {
		// Fields (rather than Split on " ") drops the empty entries that
		// repeated or trailing spaces would otherwise turn into bogus regions.
		gc.Request.BackupRegions = strings.Fields(envBackups)
	}
	if zone != nil {
		// A zonal request is pinned to its location, so no fallback regions.
		gc.Request.Zone = *zone
		gc.Request.BackupRegions = make([]string, 0)
	}

	ctx := context.Background()
	c, err := google.DefaultClient(ctx, container.CloudPlatformScope)
	if err != nil {
		log.Fatalf("failed create google client: '%v'", err)
	}
	containerService, err := container.New(c)
	if err != nil {
		log.Fatalf("failed create container service: '%v'", err)
	}
	gc.operations = &GKESDKClient{containerService}
	gc.boskosOps = &boskos.Client{}
	return gc
}
// Initialize resolves which project (and possibly which existing cluster) to
// use. Resolution order when no project was supplied: the environment
// (kubeconfig, then gcloud) and, when running in Prow, a project acquired
// from Boskos. It returns an error if no non-empty project can be determined.
func (gc *GKECluster) Initialize() error {
	if gc.Project == nil {
		// Try obtain project name via `kubectl`, `gcloud`
		if err := gc.checkEnvironment(); err != nil {
			return fmt.Errorf("failed checking existing cluster: '%v'", err)
		}
		// An existing cluster found through kubeconfig needs nothing more.
		if gc.Cluster != nil {
			return nil
		}
	}

	// Still no project: ask Boskos for one, but only inside Prow.
	if gc.Project == nil && common.IsProw() {
		res, err := gc.boskosOps.AcquireGKEProject(nil)
		if err != nil {
			return fmt.Errorf("failed acquire boskos project: '%v'", err)
		}
		gc.Project = &res.Name
	}

	if gc.Project == nil || *gc.Project == "" {
		return errors.New("gcp project must be set")
	}

	// Outside Prow a cluster created by this process is ours to delete.
	if !common.IsProw() && gc.Cluster == nil {
		gc.NeedCleanup = true
	}

	log.Printf("Using project %q for running test", *gc.Project)
	return nil
}
// Provider reports the name of the cluster provider, which is always "gke".
func (gc *GKECluster) Provider() string {
	const provider = "gke"
	return provider
}
// Acquire gets existing cluster or create a new one. When gc.Cluster is
// already set it is used as is. Otherwise creation is attempted in
// Request.Region first and then in each distinct entry of
// Request.BackupRegions, stopping at the first success. With the defaults
// that is us-central1 followed by us-west1 and us-east1. No retry happens
// when BackupRegions is empty (Setup empties it when a zone is requested).
//
// In every location an already existing cluster with the same name is
// deleted before the new one is created. The returned error is the one from
// the last attempted location, or nil on success.
func (gc *GKECluster) Acquire() error {
	gc.ensureProtected()
	// Check if using existing cluster
	if gc.Cluster != nil {
		return nil
	}

	// Perform GKE specific cluster creation logics
	clusterName, err := getResourceName(ClusterResource)
	if err != nil {
		return fmt.Errorf("failed getting cluster name: '%v'", err)
	}

	// Primary region first, then backup regions without duplicates.
	regions := []string{gc.Request.Region}
	for _, br := range gc.Request.BackupRegions {
		exist := false
		for _, region := range regions {
			if br == region {
				exist = true
				break
			}
		}
		if !exist {
			regions = append(regions, br)
		}
	}

	// The request does not depend on the location, so build it once.
	rb := &container.CreateClusterRequest{
		Cluster: &container.Cluster{
			Name: clusterName,
			// Installing addons after cluster creation takes at least 5
			// minutes, so install addons as part of cluster creation, which
			// doesn't seem to add much time on top of cluster creation
			AddonsConfig:     gc.getAddonsConfig(),
			InitialNodeCount: gc.Request.NumNodes,
			NodeConfig: &container.NodeConfig{
				MachineType: gc.Request.NodeType,
			},
		},
		ProjectId: *gc.Project,
	}

	var cluster *container.Cluster
	var op *container.Operation
	for i, region := range regions {
		// Each location starts with a clean slate.
		err = nil
		clusterLoc := getClusterLocation(region, gc.Request.Zone)

		// Deleting cluster if it already exists. The lookup error is ignored
		// on purpose: a failed lookup is treated as "no such cluster".
		existingCluster, _ := gc.operations.get(*gc.Project, clusterLoc, clusterName)
		if existingCluster != nil {
			log.Printf("Cluster %q already exists in %q. Deleting...", clusterName, clusterLoc)
			op, err = gc.operations.delete(*gc.Project, clusterName, clusterLoc)
			if err == nil {
				err = gc.wait(clusterLoc, op.Name, deletionTimeout)
			}
		}

		// Creating cluster only if previous step succeeded
		if err == nil {
			log.Printf("Creating cluster %q in %q", clusterName, clusterLoc)
			op, err = gc.operations.create(*gc.Project, clusterLoc, rb)
			if err == nil {
				if err = gc.wait(clusterLoc, op.Name, creationTimeout); err == nil {
					cluster, err = gc.operations.get(*gc.Project, clusterLoc, rb.Cluster.Name)
				}
			}
		}

		if err == nil {
			log.Print("Cluster creation completed")
			gc.Cluster = cluster
			return nil
		}

		errMsg := fmt.Sprintf("Error during cluster creation: '%v'. ", err)
		if gc.NeedCleanup { // Delete half created cluster if it's user created
			errMsg = fmt.Sprintf("%sDeleting cluster %q in %q in background...\n", errMsg, clusterName, clusterLoc)
			// Best effort: fire and forget, the result is not checked.
			go gc.operations.delete(*gc.Project, clusterName, clusterLoc)
		}
		// Retry another region if cluster creation failed.
		// TODO(chaodaiG): catch specific errors as we know what the error look like for stockout etc.
		if len(regions) != i+1 {
			errMsg = fmt.Sprintf("%sRetry another region %q for cluster creation", errMsg, regions[i+1])
		}
		// Print, not Printf: errMsg embeds arbitrary error text and must not
		// be interpreted as a format string.
		log.Print(errMsg)
	}
	return err
}
// Delete takes care of GKE cluster resource cleanup. When running in Prow it
// only releases the Boskos project and leaves cluster deletion to the
// Janitor. Otherwise it deletes the cluster if NeedCleanup is set and waits
// up to deletionTimeout for the deletion to finish.
//
// It returns an error when the project is unknown, when cleanup is requested
// but no cluster was recorded, or when the release/deletion itself fails.
func (gc *GKECluster) Delete() error {
	gc.ensureProtected()
	// Release Boskos if running in Prow, will let Janitor taking care of
	// clusters deleting
	if common.IsProw() {
		// Project stays nil when Initialize failed or never ran; report that
		// instead of panicking on the dereference below.
		if gc.Project == nil {
			return errors.New("gcp project must be set")
		}
		log.Printf("Releasing Boskos resource: '%v'", *gc.Project)
		return gc.boskosOps.ReleaseGKEProject(nil, *gc.Project)
	}

	// NeedCleanup is only true if running locally and cluster created by the
	// process
	if !gc.NeedCleanup {
		return nil
	}
	// Should only get here if running locally and cluster created by this
	// client, so at this moment cluster should have been set
	if gc.Cluster == nil {
		return errors.New("cluster doesn't exist")
	}
	if gc.Project == nil {
		return errors.New("gcp project must be set")
	}

	log.Printf("Deleting cluster %q in %q", gc.Cluster.Name, gc.Cluster.Location)
	op, err := gc.operations.delete(*gc.Project, gc.Cluster.Name, gc.Cluster.Location)
	if err == nil {
		err = gc.wait(gc.Cluster.Location, op.Name, deletionTimeout)
	}
	if err != nil {
		return fmt.Errorf("failed deleting cluster: '%v'", err)
	}
	return nil
}
// getAddonsConfig translates the addon names in Request.Addons into a typed
// AddonsConfig. Names are matched case-insensitively. The only addon known at
// the moment is istio, which enables IstioConfig; any other name panics.
func (gc *GKECluster) getAddonsConfig() *container.AddonsConfig {
	// Define all supported addons here
	const istio = "istio"

	cfg := new(container.AddonsConfig)
	for _, addon := range gc.Request.Addons {
		if strings.ToLower(addon) != istio {
			panic(fmt.Sprintf("addon type %q not supported. Has to be one of: %q", addon, istio))
		}
		cfg.IstioConfig = &container.IstioConfig{Disabled: false}
	}
	return cfg
}
// wait polls the operation opName (operation ID created by cloud) in the
// given location every 500ms until it reports DONE.
//
// It returns nil once the operation is DONE, a "timed out waiting" error when
// the wait duration elapses first, or the last polling error when a poll
// still fails after three attempts. A status other than PENDING, RUNNING or
// DONE counts as a failed attempt.
func (gc *GKECluster) wait(location, opName string, wait time.Duration) error {
	const (
		pendingStatus = "PENDING"
		runningStatus = "RUNNING"
		doneStatus    = "DONE"
	)

	// Explicit timer and ticker so both are released when wait returns;
	// time.Tick offers no way to stop its ticker.
	timeout := time.NewTimer(wait)
	defer timeout.Stop()
	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		// Got a timeout! fail with a timeout error
		case <-timeout.C:
			return errors.New("timed out waiting")
		case <-ticker.C:
			var err error
			// Retry 3 times in case of weird network error, or rate limiting
			// NOTE(review): the backoff starts at 50 microseconds, which looks
			// like it was meant to be milliseconds; kept as is, please confirm.
			for r, w := 0, 50*time.Microsecond; r < 3; r, w = r+1, w*2 {
				var op *container.Operation
				op, err = gc.operations.getOperation(*gc.Project, location, opName)
				if err == nil {
					if op.Status == doneStatus {
						return nil
					}
					if op.Status == pendingStatus || op.Status == runningStatus {
						// Valid operation, no need to retry; leaves the retry
						// loop and waits for the next tick.
						break
					}
					// Have seen intermittent error state and fixed itself,
					// let it retry to avoid too much flakiness
					err = fmt.Errorf("unexpected operation status: %q", op.Status)
				}
				time.Sleep(w)
			}
			// If err still persist after retries, exit
			if err != nil {
				return err
			}
		}
	}
}
// ensureProtected aborts the process via log.Fatalf when the configured
// project is listed in protectedProjects or the current cluster's name is
// listed in protectedClusters. Unset project or cluster are skipped.
func (gc *GKECluster) ensureProtected() {
	if project := gc.Project; project != nil {
		for i := range protectedProjects {
			if protectedProjects[i] == *project {
				log.Fatalf("project %q is protected", *project)
			}
		}
	}

	if cluster := gc.Cluster; cluster != nil {
		for i := range protectedClusters {
			if protectedClusters[i] == cluster.Name {
				log.Fatalf("cluster %q is protected", cluster.Name)
			}
		}
	}
}
// checkEnvironment looks for an existing cluster and project in the local
// tooling configuration.
//
// If `kubectl config current-context` yields a context of the form
// gke_PROJECT_REGION_CLUSTER, gc.Project is set from it and gc.Cluster is
// fetched; failing to fetch that cluster is an error. A malformed gke_
// context is ignored with a warning. Otherwise, if gcloud has a project
// configured, gc.Project is set to it, with surrounding whitespace removed.
//
// It returns an error when kubectl fails with output, when the cluster from
// kubeconfig cannot be fetched, or when gcloud fails.
func (gc *GKECluster) checkEnvironment() error {
	// if kubeconfig is configured, use it
	output, err := common.StandardExec("kubectl", "config", "current-context")
	if err == nil {
		currentContext := strings.TrimSpace(string(output))
		if strings.HasPrefix(currentContext, "gke_") {
			// output should be in the form of gke_PROJECT_REGION_CLUSTER
			parts := strings.Split(currentContext, "_")
			if len(parts) != 4 { // fall through with warning
				log.Printf("WARNING: ignoring kubectl current-context since it's malformed: '%s'", currentContext)
			} else {
				log.Printf("kubeconfig isn't empty, uses this cluster for running tests: %s", currentContext)
				gc.Project = &parts[1]
				gc.Cluster, err = gc.operations.get(*gc.Project, parts[2], parts[3])
				if err != nil {
					return fmt.Errorf("couldn't find cluster %s in %s in %s, does it exist? %v", parts[3], parts[1], parts[2], err)
				}
				return nil
			}
		}
	}
	if err != nil && len(output) > 0 {
		// this is unexpected error, should shout out directly
		return fmt.Errorf("failed running kubectl config current-context: '%s'", string(output))
	}

	// if gcloud is pointing to a project, use it
	output, err = common.StandardExec("gcloud", "config", "get-value", "project")
	if err != nil {
		return fmt.Errorf("failed getting gcloud project: '%v'", err)
	}
	// Command output may end with a newline; strip it so the project ID can
	// be used verbatim in API resource paths.
	if project := strings.TrimSpace(string(output)); project != "" {
		gc.Project = &project
	}
	return nil
}