Add failing health check if autoscaler loop consistently returns error
This commit is contained in:
parent
5988fece28
commit
972772440a
|
|
@ -99,7 +99,8 @@ var (
|
|||
"Type of node group expander to be used in scale up. Available values: ["+strings.Join(expander.AvailableExpanders, ",")+"]")
|
||||
|
||||
writeStatusConfigMapFlag = flag.Bool("write-status-configmap", true, "Should CA write status information to a configmap")
|
||||
maxInactivityFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
|
||||
maxInactivityTimeFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
|
||||
maxFailingTimeFlag = flag.Duration("max-failing-time", 15*time.Minute, "Maximum time from last recorded successful autoscaler run before automatic restart")
|
||||
)
|
||||
|
||||
func createAutoscalerOptions() core.AutoscalerOptions {
|
||||
|
|
@ -195,6 +196,8 @@ func run(healthCheck *metrics.HealthCheck) {
|
|||
err := autoscaler.RunOnce(loopStart)
|
||||
if err != nil && err.Type() != errors.TransientError {
|
||||
metrics.RegisterError(err)
|
||||
} else {
|
||||
healthCheck.UpdateLastSuccessfulRun(time.Now())
|
||||
}
|
||||
|
||||
metrics.UpdateDuration("main", loopStart)
|
||||
|
|
@ -212,7 +215,7 @@ func main() {
|
|||
"Can be used multiple times. Format: <min>:<max>:<other...>")
|
||||
kube_flag.InitFlags()
|
||||
|
||||
healthCheck := metrics.NewHealthCheck(*maxInactivityFlag)
|
||||
healthCheck := metrics.NewHealthCheck(*maxInactivityTimeFlag, *maxFailingTimeFlag)
|
||||
|
||||
glog.Infof("Cluster Autoscaler %s", ClusterAutoscalerVersion)
|
||||
|
||||
|
|
|
|||
|
|
@ -25,19 +25,24 @@ import (
|
|||
|
||||
// HealthCheck contains information about last time of autoscaler activity and timeout
|
||||
type HealthCheck struct {
|
||||
lastActivity time.Time
|
||||
mutex *sync.Mutex
|
||||
timeout time.Duration
|
||||
checkTimeout bool
|
||||
lastActivity time.Time
|
||||
lastSuccessfulRun time.Time
|
||||
mutex *sync.Mutex
|
||||
activityTimeout time.Duration
|
||||
successTimeout time.Duration
|
||||
checkTimeout bool
|
||||
}
|
||||
|
||||
// NewHealthCheck builds new HealthCheck object with given timeout
|
||||
func NewHealthCheck(timeout time.Duration) *HealthCheck {
|
||||
func NewHealthCheck(activityTimeout, successTimeout time.Duration) *HealthCheck {
|
||||
now := time.Now()
|
||||
return &HealthCheck{
|
||||
lastActivity: time.Now(),
|
||||
mutex: &sync.Mutex{},
|
||||
timeout: timeout,
|
||||
checkTimeout: false,
|
||||
lastActivity: now,
|
||||
lastSuccessfulRun: now,
|
||||
mutex: &sync.Mutex{},
|
||||
activityTimeout: activityTimeout,
|
||||
successTimeout: successTimeout,
|
||||
checkTimeout: false,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -50,18 +55,27 @@ func (hc *HealthCheck) StartMonitoring() {
|
|||
if now.After(hc.lastActivity) {
|
||||
hc.lastActivity = now
|
||||
}
|
||||
if now.After(hc.lastSuccessfulRun) {
|
||||
hc.lastSuccessfulRun = now
|
||||
}
|
||||
}
|
||||
|
||||
// ServeHTTP implements http.Handler interface to provide a health-check endpoint
|
||||
func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
hc.mutex.Lock()
|
||||
|
||||
lastActivity := hc.lastActivity
|
||||
timedOut := hc.checkTimeout && time.Now().After(lastActivity.Add(hc.timeout))
|
||||
lastSuccessfulRun := hc.lastSuccessfulRun
|
||||
now := time.Now()
|
||||
activityTimedOut := now.After(lastActivity.Add(hc.activityTimeout))
|
||||
successTimedOut := now.After(lastSuccessfulRun.Add(hc.successTimeout))
|
||||
timedOut := hc.checkTimeout && (activityTimedOut || successTimedOut)
|
||||
|
||||
hc.mutex.Unlock()
|
||||
|
||||
if timedOut {
|
||||
w.WriteHeader(500)
|
||||
w.Write([]byte(fmt.Sprintf("Error: last activity more than %v ago", time.Now().Sub(lastActivity).String())))
|
||||
w.Write([]byte(fmt.Sprintf("Error: last activity more %v ago, last success more than %v ago", time.Now().Sub(lastActivity).String(), time.Now().Sub(lastSuccessfulRun).String())))
|
||||
} else {
|
||||
w.WriteHeader(200)
|
||||
w.Write([]byte("OK"))
|
||||
|
|
@ -76,3 +90,16 @@ func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time) {
|
|||
hc.lastActivity = timestamp
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateLastSuccessfulRun updates last time of successful (i.e. not ending in error) activity
|
||||
func (hc *HealthCheck) UpdateLastSuccessfulRun(timestamp time.Time) {
|
||||
hc.mutex.Lock()
|
||||
defer hc.mutex.Unlock()
|
||||
if timestamp.After(hc.lastSuccessfulRun) {
|
||||
hc.lastSuccessfulRun = timestamp
|
||||
}
|
||||
// finishing successful run is also a sign of activity
|
||||
if timestamp.After(hc.lastActivity) {
|
||||
hc.lastActivity = timestamp
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,48 +24,52 @@ import (
|
|||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func getTestResponse(start time.Time, timeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
|
||||
func getTestResponse(start time.Time, activityTimeout, successTimeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck := NewHealthCheck(timeout)
|
||||
healthCheck := NewHealthCheck(activityTimeout, successTimeout)
|
||||
if checkMonitoring {
|
||||
healthCheck.StartMonitoring()
|
||||
}
|
||||
healthCheck.lastActivity = start
|
||||
healthCheck.lastSuccessfulRun = start
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
return w
|
||||
}
|
||||
|
||||
func TestOkServeHTTP(t *testing.T) {
|
||||
w := getTestResponse(time.Now(), time.Second, true)
|
||||
w := getTestResponse(time.Now(), time.Second, time.Second, true)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestFailServeHTTP(t *testing.T) {
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, true)
|
||||
func TestFailTimeoutServeHTTP(t *testing.T) {
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, true)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
}
|
||||
|
||||
func TestMonitoringOffAfterTimeout(t *testing.T) {
|
||||
w := getTestResponse(time.Now(), time.Second, false)
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, false)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestMonitoringOffBeforeTimeout(t *testing.T) {
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, false)
|
||||
w := getTestResponse(time.Now().Add(time.Second*2), time.Second, time.Second, false)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestUpdateLastActivity(t *testing.T) {
|
||||
timeout := time.Second
|
||||
start := time.Now().Add(timeout * -2)
|
||||
// to make sure it doesn't cause health check failure
|
||||
lastSuccess := time.Now().Add(timeout * 10)
|
||||
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck := NewHealthCheck(timeout)
|
||||
healthCheck := NewHealthCheck(timeout, timeout)
|
||||
healthCheck.StartMonitoring()
|
||||
healthCheck.lastActivity = start
|
||||
healthCheck.lastSuccessfulRun = lastSuccess
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
|
||||
|
|
@ -74,3 +78,53 @@ func TestUpdateLastActivity(t *testing.T) {
|
|||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestUpdateActivityAtUpdateLastSuccessfulRun(t *testing.T) {
|
||||
timeout := time.Second
|
||||
start := time.Now().Add(timeout * -2)
|
||||
// to make sure it doesn't cause health check failure
|
||||
lastSuccess := time.Now().Add(timeout * 10)
|
||||
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
healthCheck := NewHealthCheck(timeout, timeout)
|
||||
healthCheck.StartMonitoring()
|
||||
healthCheck.lastActivity = start
|
||||
healthCheck.lastSuccessfulRun = lastSuccess
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
|
||||
w = httptest.NewRecorder()
|
||||
healthCheck.UpdateLastSuccessfulRun(time.Now())
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
|
||||
// verify last successful run from the future wasn't overwritten
|
||||
assert.Equal(t, true, healthCheck.lastSuccessfulRun.After(healthCheck.lastActivity))
|
||||
}
|
||||
|
||||
func TestUpdateLastSuccessfulRun(t *testing.T) {
|
||||
timeout := time.Second
|
||||
start := time.Now().Add(timeout * -2)
|
||||
// to make sure it doesn't cause health check failure
|
||||
lastActivity := time.Now().Add(timeout * 10)
|
||||
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
healthCheck := NewHealthCheck(timeout, timeout)
|
||||
healthCheck.StartMonitoring()
|
||||
healthCheck.lastActivity = lastActivity
|
||||
healthCheck.lastSuccessfulRun = start
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
|
||||
w = httptest.NewRecorder()
|
||||
healthCheck.UpdateLastSuccessfulRun(time.Now())
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
|
||||
// verify last activity timestamp from the future wasn't overwritten
|
||||
assert.Equal(t, true, healthCheck.lastActivity.After(healthCheck.lastSuccessfulRun))
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue