Move backoff mechanism to utils.

This commit is contained in:
Krzysztof Jastrzebski 2018-06-13 10:02:42 +02:00
parent d36988bec7
commit dd1db7a0ac
4 changed files with 162 additions and 36 deletions

View File

@@ -26,6 +26,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/api"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
"k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
@@ -110,12 +111,6 @@ type UnregisteredNode struct {
UnregisteredSince time.Time
}
type scaleUpBackoff struct {
duration time.Duration
backoffUntil time.Time
lastFailedScaleUp time.Time
}
// ClusterStateRegistry is a structure to keep track the current state of the cluster.
type ClusterStateRegistry struct {
sync.Mutex
@@ -130,7 +125,7 @@ type ClusterStateRegistry struct {
incorrectNodeGroupSizes map[string]IncorrectNodeGroupSize
unregisteredNodes map[string]UnregisteredNode
candidatesForScaleDown map[string][]string
nodeGroupBackoffInfo map[string]scaleUpBackoff
nodeGroupBackoffInfo *backoff.Backoff
lastStatus *api.ClusterAutoscalerStatus
lastScaleDownUpdateTime time.Time
logRecorder *utils.LogEventRecorder
@@ -153,7 +148,7 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
incorrectNodeGroupSizes: make(map[string]IncorrectNodeGroupSize),
unregisteredNodes: make(map[string]UnregisteredNode),
candidatesForScaleDown: make(map[string][]string),
nodeGroupBackoffInfo: make(map[string]scaleUpBackoff),
nodeGroupBackoffInfo: backoff.NewBackoff(InitialNodeGroupBackoffDuration, MaxNodeGroupBackoffDuration, NodeGroupBackoffResetTimeout),
lastStatus: emptyStatus,
logRecorder: logRecorder,
}
@@ -176,11 +171,7 @@ func (csr *ClusterStateRegistry) RegisterScaleDown(request *ScaleDownRequest) {
// To be executed under a lock.
func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
// clean up stale backoff info
for ngId, backoffInfo := range csr.nodeGroupBackoffInfo {
if backoffInfo.lastFailedScaleUp.Add(NodeGroupBackoffResetTimeout).Before(currentTime) {
delete(csr.nodeGroupBackoffInfo, ngId)
}
}
csr.nodeGroupBackoffInfo.RemoveStaleBackoffData(currentTime)
timedOutSur := make([]*ScaleUpRequest, 0)
newSur := make([]*ScaleUpRequest, 0)
@@ -188,7 +179,7 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
if !csr.areThereUpcomingNodesInNodeGroup(sur.NodeGroupName) {
// scale-out finished successfully
// remove it and reset node group backoff
delete(csr.nodeGroupBackoffInfo, sur.NodeGroupName)
csr.nodeGroupBackoffInfo.RemoveBackoff(sur.NodeGroupName)
glog.V(4).Infof("Scale up in group %v finished successfully in %v",
sur.NodeGroupName, currentTime.Sub(sur.Time))
continue
@@ -228,24 +219,7 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
// To be executed under a lock.
func (csr *ClusterStateRegistry) backoffNodeGroup(nodeGroupName string, currentTime time.Time) {
duration := InitialNodeGroupBackoffDuration
if backoffInfo, found := csr.nodeGroupBackoffInfo[nodeGroupName]; found {
// Multiple concurrent scale-ups failing shouldn't cause backoff
// duration to increase, so we only increase it if we're not in
// backoff right now.
if backoffInfo.backoffUntil.Before(currentTime) {
duration = 2 * backoffInfo.duration
if duration > MaxNodeGroupBackoffDuration {
duration = MaxNodeGroupBackoffDuration
}
}
}
backoffUntil := currentTime.Add(duration)
csr.nodeGroupBackoffInfo[nodeGroupName] = scaleUpBackoff{
duration: duration,
backoffUntil: backoffUntil,
lastFailedScaleUp: currentTime,
}
backoffUntil := csr.nodeGroupBackoffInfo.Backoff(nodeGroupName, currentTime)
glog.Warningf("Disabling scale-up for node group %v until %v", nodeGroupName, backoffUntil)
}
@@ -386,8 +360,7 @@ func (csr *ClusterStateRegistry) IsNodeGroupSafeToScaleUp(nodeGroupName string,
if !csr.IsNodeGroupHealthy(nodeGroupName) {
return false
}
backoffInfo, found := csr.nodeGroupBackoffInfo[nodeGroupName]
return !found || backoffInfo.backoffUntil.Before(now)
return !csr.nodeGroupBackoffInfo.IsBackedOff(nodeGroupName, now)
}
func (csr *ClusterStateRegistry) areThereUpcomingNodesInNodeGroup(nodeGroupName string) bool {

View File

@@ -669,6 +669,5 @@ func TestScaleUpBackoff(t *testing.T) {
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.True(t, clusterstate.IsNodeGroupSafeToScaleUp("ng1", now))
_, found := clusterstate.nodeGroupBackoffInfo["ng1"]
assert.False(t, found)
assert.False(t, clusterstate.nodeGroupBackoffInfo.IsBackedOff("ng1", now))
}

View File

@@ -0,0 +1,83 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package backoff
import (
"time"
)
type backoffInfo struct {
duration time.Duration
backoffUntil time.Time
lastFailedExecution time.Time
}
// Backoff handles backing off executions.
type Backoff struct {
maxBackoffDuration time.Duration
initialBackoffDuration time.Duration
backoffResetTimeout time.Duration
backoffInfo map[string]backoffInfo
}
// NewBackoff creates an instance of Backoff.
func NewBackoff(initialBackoffDuration time.Duration, maxBackoffDuration time.Duration, backoffResetTimeout time.Duration) *Backoff {
return &Backoff{maxBackoffDuration, initialBackoffDuration, backoffResetTimeout, make(map[string]backoffInfo)}
}
// RemoveStaleBackoffData removes stale backoff data.
func (b *Backoff) RemoveStaleBackoffData(currentTime time.Time) {
for key, backoffInfo := range b.backoffInfo {
if backoffInfo.lastFailedExecution.Add(b.backoffResetTimeout).Before(currentTime) {
delete(b.backoffInfo, key)
}
}
}
// Backoff execution for the given key. Returns time till execution is backed off.
func (b *Backoff) Backoff(key string, currentTime time.Time) time.Time {
duration := b.initialBackoffDuration
if backoffInfo, found := b.backoffInfo[key]; found {
// Multiple concurrent scale-ups failing shouldn't cause backoff
// duration to increase, so we only increase it if we're not in
// backoff right now.
if backoffInfo.backoffUntil.Before(currentTime) {
duration = 2 * backoffInfo.duration
if duration > b.maxBackoffDuration {
duration = b.maxBackoffDuration
}
}
}
backoffUntil := currentTime.Add(duration)
b.backoffInfo[key] = backoffInfo{
duration: duration,
backoffUntil: backoffUntil,
lastFailedExecution: currentTime,
}
return backoffUntil
}
// RemoveBackoff removes backoff data for the given key.
func (b *Backoff) RemoveBackoff(key string) {
delete(b.backoffInfo, key)
}
// IsBackedOff returns true if execution is backed off for the given key.
func (b *Backoff) IsBackedOff(key string, currentTime time.Time) bool {
backoffInfo, found := b.backoffInfo[key]
return found && backoffInfo.backoffUntil.After(currentTime)
}

View File

@@ -0,0 +1,71 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package backoff
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// TestBackoffTwoKeys verifies that backing off one key does not affect
// another key, and that the initial backoff period expires on schedule.
func TestBackoffTwoKeys(t *testing.T) {
	b := NewBackoff(10*time.Minute, time.Hour, 3*time.Hour)
	now := time.Now()
	// Neither key is backed off before any failure is recorded.
	assert.False(t, b.IsBackedOff("key1", now))
	assert.False(t, b.IsBackedOff("key2", now))
	// A failure on key1 backs off key1 only.
	b.Backoff("key1", now.Add(time.Minute))
	assert.True(t, b.IsBackedOff("key1", now.Add(2*time.Minute)))
	assert.False(t, b.IsBackedOff("key2", now))
	// The 10-minute initial backoff (started at +1m) has expired by +11m.
	assert.False(t, b.IsBackedOff("key1", now.Add(11*time.Minute)))
}
// TestMaxBackoff verifies that the backoff duration doubles after each
// expired backoff (1m -> 2m -> 3m) and never exceeds the configured maximum.
func TestMaxBackoff(t *testing.T) {
	b := NewBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
	now := time.Now()
	// First failure: backed off for the initial 1 minute.
	b.Backoff("key1", now)
	assert.True(t, b.IsBackedOff("key1", now))
	assert.False(t, b.IsBackedOff("key1", now.Add(1*time.Minute)))
	// Second failure after expiry: duration doubles to 2 minutes.
	b.Backoff("key1", now.Add(1*time.Minute))
	assert.True(t, b.IsBackedOff("key1", now.Add(1*time.Minute)))
	assert.False(t, b.IsBackedOff("key1", now.Add(3*time.Minute)))
	// Third failure: doubling would give 4 minutes, capped at the 3-minute max.
	b.Backoff("key1", now.Add(3*time.Minute))
	assert.True(t, b.IsBackedOff("key1", now.Add(3*time.Minute)))
	assert.False(t, b.IsBackedOff("key1", now.Add(6*time.Minute)))
}
// TestRemoveBackoff verifies that RemoveBackoff immediately clears an
// active backoff for the given key.
func TestRemoveBackoff(t *testing.T) {
	b := NewBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
	now := time.Now()
	b.Backoff("key1", now)
	assert.True(t, b.IsBackedOff("key1", now))
	// Removing the key lifts the backoff even though it has not expired.
	b.RemoveBackoff("key1")
	assert.False(t, b.IsBackedOff("key1", now))
}
// TestResetStaleBackoffData verifies that RemoveStaleBackoffData drops only
// entries whose last failure is older than the 3-hour reset timeout.
func TestResetStaleBackoffData(t *testing.T) {
	b := NewBackoff(1*time.Minute, 3*time.Minute, 3*time.Hour)
	now := time.Now()
	b.Backoff("key1", now)
	b.Backoff("key2", now.Add(time.Hour))
	// At +1h neither entry is older than 3 hours; both survive.
	b.RemoveStaleBackoffData(now.Add(time.Hour))
	assert.Equal(t, 2, len(b.backoffInfo))
	// At +4h only key1 (failed at +0h) is stale.
	b.RemoveStaleBackoffData(now.Add(4 * time.Hour))
	assert.Equal(t, 1, len(b.backoffInfo))
	// At +5h key2 (failed at +1h) is stale as well.
	b.RemoveStaleBackoffData(now.Add(5 * time.Hour))
	assert.Equal(t, 0, len(b.backoffInfo))
}