Add typed errors; add errors_total metric

To keep the commit size reasonable, only top-level files use the new errors. They will be added to the other files in subsequent commits.
2017-05-16 16:56:54 +02:00 · 2017-05-16 16:56:54 +02:00 · f716a7e496
parent b432362a70
commit f716a7e496
8 changed files with 122 additions and 24 deletions
--- a/cluster-autoscaler/core/autoscaler.go
+++ b/cluster-autoscaler/core/autoscaler.go
@ -22,6 +22,7 @@ import (
 	"github.com/golang/glog"
 	"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	kube_record "k8s.io/client-go/tools/record"
 	kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
@ -37,7 +38,7 @@ type AutoscalerOptions struct {
 // The configuration can be injected at the creation of an autoscaler
 type Autoscaler interface {
 	// RunOnce represents an iteration in the control-loop of CA
-	RunOnce(currentTime time.Time)
+	RunOnce(currentTime time.Time) *errors.AutoscalerError
 	// CleanUp represents a clean-up required before the first invocation of RunOnce
 	CleanUp()
 	// ExitCleanUp is a clean-up performed just before process termination.
--- a/cluster-autoscaler/core/dynamic_autoscaler.go
+++ b/cluster-autoscaler/core/dynamic_autoscaler.go
@ -24,6 +24,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	kube_record "k8s.io/client-go/tools/record"
 	kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
@ -56,14 +57,14 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
 }

 // RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
-func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) {
+func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
 	reconfigureStart := time.Now()
 	metrics.UpdateLastTime("reconfigure", reconfigureStart)
 	if err := a.Reconfigure(); err != nil {
 		glog.Errorf("Failed to reconfigure : %v", err)
 	}
 	metrics.UpdateDuration("reconfigure", reconfigureStart)
-	a.autoscaler.RunOnce(currentTime)
+	return a.autoscaler.RunOnce(currentTime)
 }

 // Reconfigure this dynamic autoscaler if the configmap is updated
--- a/cluster-autoscaler/core/dynamic_autoscaler_test.go
+++ b/cluster-autoscaler/core/dynamic_autoscaler_test.go
@ -19,6 +19,7 @@ package core
 import (
 	"github.com/stretchr/testify/mock"
 	"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	"testing"
 	"time"
 )
@ -27,8 +28,9 @@ type AutoscalerMock struct {
 	mock.Mock
 }

-func (m *AutoscalerMock) RunOnce(currentTime time.Time) {
+func (m *AutoscalerMock) RunOnce(currentTime time.Time) *errors.AutoscalerError {
 	m.Called(currentTime)
+	return nil
 }

 func (m *AutoscalerMock) CleanUp() {
--- a/cluster-autoscaler/core/polling_autoscaler.go
+++ b/cluster-autoscaler/core/polling_autoscaler.go
@ -21,6 +21,7 @@ import (

 	"github.com/golang/glog"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 )

 // PollingAutoscaler is a variant of autoscaler which polls the source-of-truth every time RunOnce is invoked
@ -48,14 +49,14 @@ func (a *PollingAutoscaler) ExitCleanUp() {
 }

 // RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
-func (a *PollingAutoscaler) RunOnce(currentTime time.Time) {
+func (a *PollingAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
 	reconfigureStart := time.Now()
 	metrics.UpdateLastTime("poll", reconfigureStart)
 	if err := a.Poll(); err != nil {
 		glog.Errorf("Failed to poll : %v", err)
 	}
 	metrics.UpdateDuration("poll", reconfigureStart)
-	a.autoscaler.RunOnce(currentTime)
+	return a.autoscaler.RunOnce(currentTime)
 }

 // Poll latest data from cloud provider to recreate this autoscaler
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@ -21,6 +21,7 @@ import (

 	"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"

 	kube_record "k8s.io/client-go/tools/record"
@ -72,7 +73,7 @@ func (a *StaticAutoscaler) CleanUp() {
 }

 // RunOnce iterates over node groups and scales them up/down if necessary
-func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
+func (a *StaticAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
 	readyNodeLister := a.ReadyNodeLister()
 	allNodeLister := a.AllNodeLister()
 	unschedulablePodLister := a.UnschedulablePodLister()
@ -85,30 +86,30 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 	readyNodes, err := readyNodeLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list ready nodes: %v", err)
-		return
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
 	}
 	if len(readyNodes) == 0 {
 		glog.Error("No ready nodes in the cluster")
 		scaleDown.CleanUpUnneededNodes()
-		return
+		return nil
 	}

 	allNodes, err := allNodeLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list all nodes: %v", err)
-		return
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
 	}
 	if len(allNodes) == 0 {
 		glog.Error("No nodes in the cluster")
 		scaleDown.CleanUpUnneededNodes()
-		return
+		return nil
 	}

 	err = a.ClusterStateRegistry.UpdateNodes(allNodes, currentTime)
 	if err != nil {
 		glog.Errorf("Failed to update node registry: %v", err)
 		scaleDown.CleanUpUnneededNodes()
-		return
+		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 	metrics.UpdateClusterState(a.ClusterStateRegistry)

@ -122,7 +123,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 	if !a.ClusterStateRegistry.IsClusterHealthy() {
 		glog.Warning("Cluster is not ready for autoscaling")
 		scaleDown.CleanUpUnneededNodes()
-		return
+		return nil
 	}

 	metrics.UpdateDuration("updateClusterState", runStart)
@ -142,12 +143,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 				glog.Warningf("Failed to remove unregistered nodes: %v", err)

 			}
-			return
+			return errors.ToAutoscalerError(errors.CloudProviderError, err)
 		}
 		// Some nodes were removed. Let's skip this iteration, the next one should be better.
 		if removedAny {
 			glog.V(0).Infof("Some unregistered nodes were removed, skipping iteration")
-			return
+			return nil
 		}
 	}

@ -157,24 +158,24 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 	fixedSomething, err := fixNodeGroupSize(autoscalingContext, time.Now())
 	if err != nil {
 		glog.Warningf("Failed to fix node group sizes: %v", err)
-		return
+		return errors.ToAutoscalerError(errors.CloudProviderError, err)
 	}
 	if fixedSomething {
 		glog.V(0).Infof("Some node group target size was fixed, skipping the iteration")
-		return
+		return nil
 	}

 	allUnschedulablePods, err := unschedulablePodLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list unscheduled pods: %v", err)
-		return
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
 	}
 	metrics.UpdateUnschedulablePodsCount(len(allUnschedulablePods))

 	allScheduled, err := scheduledPodLister.List()
 	if err != nil {
 		glog.Errorf("Failed to list scheduled pods: %v", err)
-		return
+		return errors.ToAutoscalerError(errors.ApiCallError, err)
 	}

 	// We need to reset all pods that have been marked as unschedulable not after
@ -233,11 +234,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {

 		if err != nil {
 			glog.Errorf("Failed to scale up: %v", err)
-			return
+			// TODO(maciekpytel): temporary hack, fix this
+			return nil
 		} else if scaledUp {
 			a.lastScaleUpTime = time.Now()
 			// No scale down in this iteration.
-			return
+			return nil
 		}
 	}

@ -247,7 +249,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 		pdbs, err := pdbLister.List()
 		if err != nil {
 			glog.Errorf("Failed to list pod disruption budgets: %v", err)
-			return
+			return errors.ToAutoscalerError(errors.ApiCallError, err)
 		}

 		// In dry run only utilization is updated
@ -265,7 +267,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 		err = scaleDown.UpdateUnneededNodes(allNodes, allScheduled, time.Now(), pdbs)
 		if err != nil {
 			glog.Warningf("Failed to scale down: %v", err)
-			return
+			// TODO(maciekpytel): temporary hack, fix this
+			return nil
 		}

 		metrics.UpdateDuration("findUnneeded", unneededStart)
@ -294,6 +297,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
 			}
 		}
 	}
+	return nil
 }

 // ExitCleanUp removes status configmap.
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@ -35,6 +35,7 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/expander"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
 	kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
 	kube_leaderelection "k8s.io/kubernetes/pkg/client/leaderelection"
@ -191,7 +192,10 @@ func run(_ <-chan struct{}) {
 				loopStart := time.Now()
 				metrics.UpdateLastTime("main", loopStart)

-				autoscaler.RunOnce(loopStart)
+				err := autoscaler.RunOnce(loopStart)
+				if err != nil && err.Type() != errors.TransientError {
+					metrics.RegisterError(err)
+				}

 				metrics.UpdateDuration("main", loopStart)
 			}
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@ -21,6 +21,7 @@ import (
 	"time"

 	"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"

 	"github.com/prometheus/client_golang/prometheus"
 )
@ -86,6 +87,14 @@ var (
 	)

 	/**** Metrics related to autoscaler operations ****/
+	// errorsCount is a counter vector partitioned by the "type" label.
+	// RegisterError increments the series whose label matches the error's type.
+	errorsCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "errors_total",
+			Help:      "The number of CA loops failed due to an error.",
+		}, []string{"type"},
+	)
+
 	scaleUpCount = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Namespace: caNamespace,
@ -125,6 +134,7 @@ func init() {
 	prometheus.MustRegister(unschedulablePodsCount)
 	prometheus.MustRegister(lastActivity)
 	prometheus.MustRegister(functionDuration)
+	prometheus.MustRegister(errorsCount)
 	prometheus.MustRegister(scaleUpCount)
 	prometheus.MustRegister(scaleDownCount)
 	prometheus.MustRegister(evictionsCount)
@ -166,6 +176,12 @@ func UpdateUnschedulablePodsCount(podsCount int) {
 	unschedulablePodsCount.Set(float64(podsCount))
 }

+// RegisterError bumps the errors_total counter by one for an error that
+// prevented Cluster Autoscaler from working. The error's type is used as the
+// value of the counter's "type" label.
+// No more than one error should be recorded per loop. err must be non-nil.
+func RegisterError(err *errors.AutoscalerError) {
+	errorType := string(err.Type())
+	errorsCount.WithLabelValues(errorType).Add(1.0)
+}
+
 // RegisterScaleUp records number of nodes added by scale up
 func RegisterScaleUp(nodesCount int) {
 	scaleUpCount.Add(float64(nodesCount))
--- a/cluster-autoscaler/utils/errors/errors.go
+++ b/cluster-autoscaler/utils/errors/errors.go
@ -0,0 +1,69 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package errors
+
+import (
+	"fmt"
+)
+
+// AutoscalerErrorType describes a high-level category of a given error.
+// It is a string type; the defined categories are the constants declared below.
+type AutoscalerErrorType string
+
+// AutoscalerError contains information about Autoscaler errors:
+// a category and a human-readable message. Both fields are unexported and
+// are read through the Type and Error methods.
+type AutoscalerError struct {
+	errorType AutoscalerErrorType // category returned by Type()
+	msg       string              // text returned by Error()
+}
+
+// Defined error categories. metrics.RegisterError converts the category to a
+// string and uses it as the "type" label value of the errors_total counter, so
+// changing one of these string values changes the emitted label value.
+const (
+	// CloudProviderError is an error related to underlying infrastructure
+	CloudProviderError AutoscalerErrorType = "cloudProviderError"
+	// ApiCallError is an error related to communication with k8s API server
+	ApiCallError AutoscalerErrorType = "apiCallError"
+	// InternalError is an error inside Cluster Autoscaler
+	InternalError AutoscalerErrorType = "internalError"
+	// TransientError is an error that causes us to skip a single loop, but
+	// does not require any additional action.
+	TransientError AutoscalerErrorType = "transientError"
+)
+
+// NewAutoscalerError builds an AutoscalerError of the given type.
+// msg is used as a fmt.Sprintf format string and args supply its operands, so
+// a literal '%' in msg has to be written as "%%".
+func NewAutoscalerError(errorType AutoscalerErrorType, msg string, args ...interface{}) *AutoscalerError {
+	formatted := fmt.Sprintf(msg, args...)
+	autoscalerErr := &AutoscalerError{errorType: errorType, msg: formatted}
+	return autoscalerErr
+}
+
+// ToAutoscalerError converts an error to AutoscalerError with given type,
+// unless it already is an AutoscalerError (in which case it's not modified).
+// A nil err yields nil, so the result can be returned directly as "no error".
+func ToAutoscalerError(defaultType AutoscalerErrorType, err error) *AutoscalerError {
+	if err == nil {
+		return nil
+	}
+	if e, ok := err.(*AutoscalerError); ok {
+		return e
+	}
+	// Copy the message verbatim instead of routing it through
+	// NewAutoscalerError: that constructor treats its msg argument as a fmt
+	// format string, so any '%' in err.Error() would be reinterpreted as a
+	// formatting verb and garble the text.
+	return &AutoscalerError{
+		errorType: defaultType,
+		msg:       err.Error(),
+	}
+}
+
+// Error implements the golang error interface by returning the message
+// stored in the error.
+func (e *AutoscalerError) Error() string {
+	return e.msg
+}
+
+// Type returns the type (category) of the AutoscalerError.
+func (e *AutoscalerError) Type() AutoscalerErrorType {
+	return e.errorType
+}