Add typed errors; add errors_total metric
To keep reasonable commit size only top-level files use new errors. Will add them in other files in next commits.
This commit is contained in:
parent
b432362a70
commit
f716a7e496
|
|
@ -22,6 +22,7 @@ import (
|
|||
"github.com/golang/glog"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
|
||||
kube_record "k8s.io/client-go/tools/record"
|
||||
kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
|
||||
|
|
@ -37,7 +38,7 @@ type AutoscalerOptions struct {
|
|||
// The configuration can be injected at the creation of an autoscaler
|
||||
type Autoscaler interface {
|
||||
// RunOnce represents an iteration in the control-loop of CA
|
||||
RunOnce(currentTime time.Time)
|
||||
RunOnce(currentTime time.Time) *errors.AutoscalerError
|
||||
// CleanUp represents a clean-up required before the first invocation of RunOnce
|
||||
CleanUp()
|
||||
// ExitCleanUp is a clean-up performed just before process termination.
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import (
|
|||
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
|
||||
kube_record "k8s.io/client-go/tools/record"
|
||||
kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
|
||||
|
|
@ -56,14 +57,14 @@ func (a *DynamicAutoscaler) ExitCleanUp() {
|
|||
}
|
||||
|
||||
// RunOnce represents a single iteration of a dynamic autoscaler inside the CA's control-loop
|
||||
func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) {
|
||||
func (a *DynamicAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
|
||||
reconfigureStart := time.Now()
|
||||
metrics.UpdateLastTime("reconfigure", reconfigureStart)
|
||||
if err := a.Reconfigure(); err != nil {
|
||||
glog.Errorf("Failed to reconfigure : %v", err)
|
||||
}
|
||||
metrics.UpdateDuration("reconfigure", reconfigureStart)
|
||||
a.autoscaler.RunOnce(currentTime)
|
||||
return a.autoscaler.RunOnce(currentTime)
|
||||
}
|
||||
|
||||
// Reconfigure this dynamic autoscaler if the configmap is updated
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ package core
|
|||
import (
|
||||
"github.com/stretchr/testify/mock"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
|
@ -27,8 +28,9 @@ type AutoscalerMock struct {
|
|||
mock.Mock
|
||||
}
|
||||
|
||||
func (m *AutoscalerMock) RunOnce(currentTime time.Time) {
|
||||
func (m *AutoscalerMock) RunOnce(currentTime time.Time) *errors.AutoscalerError {
|
||||
m.Called(currentTime)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *AutoscalerMock) CleanUp() {
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import (
|
|||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
)
|
||||
|
||||
// PollingAutoscaler is a variant of autoscaler which polls the source-of-truth every time RunOnce is invoked
|
||||
|
|
@ -48,14 +49,14 @@ func (a *PollingAutoscaler) ExitCleanUp() {
|
|||
}
|
||||
|
||||
// RunOnce represents a single iteration of a polling autoscaler inside the CA's control-loop
|
||||
func (a *PollingAutoscaler) RunOnce(currentTime time.Time) {
|
||||
func (a *PollingAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
|
||||
reconfigureStart := time.Now()
|
||||
metrics.UpdateLastTime("poll", reconfigureStart)
|
||||
if err := a.Poll(); err != nil {
|
||||
glog.Errorf("Failed to poll : %v", err)
|
||||
}
|
||||
metrics.UpdateDuration("poll", reconfigureStart)
|
||||
a.autoscaler.RunOnce(currentTime)
|
||||
return a.autoscaler.RunOnce(currentTime)
|
||||
}
|
||||
|
||||
// Poll latest data from cloud provider to recreate this autoscaler
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import (
|
|||
|
||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
|
||||
|
||||
kube_record "k8s.io/client-go/tools/record"
|
||||
|
|
@ -72,7 +73,7 @@ func (a *StaticAutoscaler) CleanUp() {
|
|||
}
|
||||
|
||||
// RunOnce iterates over node groups and scales them up/down if necessary
|
||||
func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
||||
func (a *StaticAutoscaler) RunOnce(currentTime time.Time) *errors.AutoscalerError {
|
||||
readyNodeLister := a.ReadyNodeLister()
|
||||
allNodeLister := a.AllNodeLister()
|
||||
unschedulablePodLister := a.UnschedulablePodLister()
|
||||
|
|
@ -85,30 +86,30 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
readyNodes, err := readyNodeLister.List()
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to list ready nodes: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.ApiCallError, err)
|
||||
}
|
||||
if len(readyNodes) == 0 {
|
||||
glog.Error("No ready nodes in the cluster")
|
||||
scaleDown.CleanUpUnneededNodes()
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
allNodes, err := allNodeLister.List()
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to list all nodes: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.ApiCallError, err)
|
||||
}
|
||||
if len(allNodes) == 0 {
|
||||
glog.Error("No nodes in the cluster")
|
||||
scaleDown.CleanUpUnneededNodes()
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
err = a.ClusterStateRegistry.UpdateNodes(allNodes, currentTime)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to update node registry: %v", err)
|
||||
scaleDown.CleanUpUnneededNodes()
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.CloudProviderError, err)
|
||||
}
|
||||
metrics.UpdateClusterState(a.ClusterStateRegistry)
|
||||
|
||||
|
|
@ -122,7 +123,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
if !a.ClusterStateRegistry.IsClusterHealthy() {
|
||||
glog.Warning("Cluster is not ready for autoscaling")
|
||||
scaleDown.CleanUpUnneededNodes()
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
metrics.UpdateDuration("updateClusterState", runStart)
|
||||
|
|
@ -142,12 +143,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
glog.Warningf("Failed to remove unregistered nodes: %v", err)
|
||||
|
||||
}
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.CloudProviderError, err)
|
||||
}
|
||||
// Some nodes were removed. Let's skip this iteration, the next one should be better.
|
||||
if removedAny {
|
||||
glog.V(0).Infof("Some unregistered nodes were removed, skipping iteration")
|
||||
return
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -157,24 +158,24 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
fixedSomething, err := fixNodeGroupSize(autoscalingContext, time.Now())
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to fix node group sizes: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.CloudProviderError, err)
|
||||
}
|
||||
if fixedSomething {
|
||||
glog.V(0).Infof("Some node group target size was fixed, skipping the iteration")
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
allUnschedulablePods, err := unschedulablePodLister.List()
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to list unscheduled pods: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.ApiCallError, err)
|
||||
}
|
||||
metrics.UpdateUnschedulablePodsCount(len(allUnschedulablePods))
|
||||
|
||||
allScheduled, err := scheduledPodLister.List()
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to list scheduled pods: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.ApiCallError, err)
|
||||
}
|
||||
|
||||
// We need to reset all pods that have been marked as unschedulable not after
|
||||
|
|
@ -233,11 +234,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to scale up: %v", err)
|
||||
return
|
||||
// TODO(maciekpytel): temporary hack, fix this
|
||||
return nil
|
||||
} else if scaledUp {
|
||||
a.lastScaleUpTime = time.Now()
|
||||
// No scale down in this iteration.
|
||||
return
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -247,7 +249,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
pdbs, err := pdbLister.List()
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to list pod disruption budgets: %v", err)
|
||||
return
|
||||
return errors.ToAutoscalerError(errors.ApiCallError, err)
|
||||
}
|
||||
|
||||
// In dry run only utilization is updated
|
||||
|
|
@ -265,7 +267,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
err = scaleDown.UpdateUnneededNodes(allNodes, allScheduled, time.Now(), pdbs)
|
||||
if err != nil {
|
||||
glog.Warningf("Failed to scale down: %v", err)
|
||||
return
|
||||
// TODO(maciekpytel): temporary hack, fix this
|
||||
return nil
|
||||
}
|
||||
|
||||
metrics.UpdateDuration("findUnneeded", unneededStart)
|
||||
|
|
@ -294,6 +297,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) {
|
|||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ExitCleanUp removes status configmap.
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import (
|
|||
"k8s.io/autoscaler/cluster-autoscaler/expander"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/metrics"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/simulator"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
|
||||
kube_client "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
|
||||
kube_leaderelection "k8s.io/kubernetes/pkg/client/leaderelection"
|
||||
|
|
@ -191,7 +192,10 @@ func run(_ <-chan struct{}) {
|
|||
loopStart := time.Now()
|
||||
metrics.UpdateLastTime("main", loopStart)
|
||||
|
||||
autoscaler.RunOnce(loopStart)
|
||||
err := autoscaler.RunOnce(loopStart)
|
||||
if err != nil && err.Type() != errors.TransientError {
|
||||
metrics.RegisterError(err)
|
||||
}
|
||||
|
||||
metrics.UpdateDuration("main", loopStart)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import (
|
|||
"time"
|
||||
|
||||
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
|
||||
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
|
@ -86,6 +87,14 @@ var (
|
|||
)
|
||||
|
||||
/**** Metrics related to autoscaler operations ****/
|
||||
errorsCount = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: caNamespace,
|
||||
Name: "errors_total",
|
||||
Help: "The number of CA loops failed due to an error.",
|
||||
}, []string{"type"},
|
||||
)
|
||||
|
||||
scaleUpCount = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: caNamespace,
|
||||
|
|
@ -125,6 +134,7 @@ func init() {
|
|||
prometheus.MustRegister(unschedulablePodsCount)
|
||||
prometheus.MustRegister(lastActivity)
|
||||
prometheus.MustRegister(functionDuration)
|
||||
prometheus.MustRegister(errorsCount)
|
||||
prometheus.MustRegister(scaleUpCount)
|
||||
prometheus.MustRegister(scaleDownCount)
|
||||
prometheus.MustRegister(evictionsCount)
|
||||
|
|
@ -166,6 +176,12 @@ func UpdateUnschedulablePodsCount(podsCount int) {
|
|||
unschedulablePodsCount.Set(float64(podsCount))
|
||||
}
|
||||
|
||||
// RegisterError records any errors preventing Cluster Autoscaler from working.
|
||||
// No more than one error should be recorded per loop.
|
||||
func RegisterError(err *errors.AutoscalerError) {
|
||||
errorsCount.WithLabelValues(string(err.Type())).Add(1.0)
|
||||
}
|
||||
|
||||
// RegisterScaleUp records number of nodes added by scale up
|
||||
func RegisterScaleUp(nodesCount int) {
|
||||
scaleUpCount.Add(float64(nodesCount))
|
||||
|
|
|
|||
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package errors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// AutoscalerErrorType describes a high-level category of a given error
|
||||
type AutoscalerErrorType string
|
||||
|
||||
// AutoscalerError contains information about Autoscaler errors
|
||||
type AutoscalerError struct {
|
||||
errorType AutoscalerErrorType
|
||||
msg string
|
||||
}
|
||||
|
||||
const (
|
||||
// CloudProviderError is an error related to underlying infrastructure
|
||||
CloudProviderError AutoscalerErrorType = "cloudProviderError"
|
||||
// ApiCallError is an error related to communication with k8s API server
|
||||
ApiCallError AutoscalerErrorType = "apiCallError"
|
||||
// InternalError is an error inside Cluster Autoscaler
|
||||
InternalError AutoscalerErrorType = "internalError"
|
||||
// TransientError is an error that causes us to skip a single loop, but
|
||||
// does not require any additional action.
|
||||
TransientError AutoscalerErrorType = "transientError"
|
||||
)
|
||||
|
||||
// NewAutoscalerError returns new autoscaler error with a message constructed from format string
|
||||
func NewAutoscalerError(errorType AutoscalerErrorType, msg string, args ...interface{}) *AutoscalerError {
|
||||
return &AutoscalerError{
|
||||
errorType: errorType,
|
||||
msg: fmt.Sprintf(msg, args...),
|
||||
}
|
||||
}
|
||||
|
||||
// ToAutoscalerError converts an error to AutoscalerError with given type,
|
||||
// unless it already is an AutoscalerError (in which case it's not modified).
|
||||
func ToAutoscalerError(defaultType AutoscalerErrorType, err error) *AutoscalerError {
|
||||
if e, ok := err.(*AutoscalerError); ok {
|
||||
return e
|
||||
}
|
||||
return NewAutoscalerError(defaultType, err.Error())
|
||||
}
|
||||
|
||||
// Error implements golang error interface
|
||||
func (e *AutoscalerError) Error() string {
|
||||
return e.msg
|
||||
}
|
||||
|
||||
// Type returns the typ of AutoscalerError
|
||||
func (e *AutoscalerError) Type() AutoscalerErrorType {
|
||||
return e.errorType
|
||||
}
|
||||
Loading…
Reference in New Issue