Add health check
This commit is contained in:
parent
9083c135c6
commit
7c94367099
|
@ -99,6 +99,7 @@ var (
|
|||
"Type of node group expander to be used in scale up. Available values: ["+strings.Join(expander.AvailableExpanders, ",")+"]")
|
||||
|
||||
writeStatusConfigMapFlag = flag.Bool("write-status-configmap", true, "Should CA write status information to a configmap")
|
||||
maxInactivityFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
|
||||
)
|
||||
|
||||
func createAutoscalerOptions() core.AutoscalerOptions {
|
||||
|
@ -166,10 +167,7 @@ func registerSignalHandlers(autoscaler core.Autoscaler) {
|
|||
}()
|
||||
}
|
||||
|
||||
// In order to meet interface criteria for LeaderElectionConfig we need to
|
||||
// take stop channel as an argument. However, since we are committing a suicide
|
||||
// after loosing mastership we can safely ignore it.
|
||||
func run(_ <-chan struct{}) {
|
||||
func run(healthCheck *metrics.HealthCheck) {
|
||||
kubeClient := createKubeClient()
|
||||
kubeEventRecorder := kube_util.CreateEventRecorder(kubeClient)
|
||||
opts := createAutoscalerOptions()
|
||||
|
@ -184,6 +182,7 @@ func run(_ <-chan struct{}) {
|
|||
|
||||
autoscaler.CleanUp()
|
||||
registerSignalHandlers(autoscaler)
|
||||
healthCheck.StartMonitoring()
|
||||
|
||||
for {
|
||||
select {
|
||||
|
@ -191,6 +190,7 @@ func run(_ <-chan struct{}) {
|
|||
{
|
||||
loopStart := time.Now()
|
||||
metrics.UpdateLastTime("main", loopStart)
|
||||
healthCheck.UpdateLastActivity(loopStart)
|
||||
|
||||
err := autoscaler.RunOnce(loopStart)
|
||||
if err != nil && err.Type() != errors.TransientError {
|
||||
|
@ -212,6 +212,8 @@ func main() {
|
|||
"Can be used multiple times. Format: <min>:<max>:<other...>")
|
||||
kube_flag.InitFlags()
|
||||
|
||||
healthCheck := metrics.NewHealthCheck(*maxInactivityFlag)
|
||||
|
||||
glog.Infof("Cluster Autoscaler %s", ClusterAutoscalerVersion)
|
||||
|
||||
correctEstimator := false
|
||||
|
@ -226,12 +228,13 @@ func main() {
|
|||
|
||||
go func() {
|
||||
http.Handle("/metrics", prometheus.Handler())
|
||||
http.Handle("/health-check", healthCheck)
|
||||
err := http.ListenAndServe(*address, nil)
|
||||
glog.Fatalf("Failed to start metrics: %v", err)
|
||||
}()
|
||||
|
||||
if !leaderElection.LeaderElect {
|
||||
run(nil)
|
||||
run(healthCheck)
|
||||
} else {
|
||||
id, err := os.Hostname()
|
||||
if err != nil {
|
||||
|
@ -262,7 +265,11 @@ func main() {
|
|||
RenewDeadline: leaderElection.RenewDeadline.Duration,
|
||||
RetryPeriod: leaderElection.RetryPeriod.Duration,
|
||||
Callbacks: kube_leaderelection.LeaderCallbacks{
|
||||
OnStartedLeading: run,
|
||||
OnStartedLeading: func(_ <-chan struct{}) {
|
||||
// Since we are committing a suicide after losing
|
||||
// mastership, we can safely ignore the argument.
|
||||
run(healthCheck)
|
||||
},
|
||||
OnStoppedLeading: func() {
|
||||
glog.Fatalf("lost master")
|
||||
},
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HealthCheck contains information about last time of autoscaler activity and timeout
|
||||
type HealthCheck struct {
|
||||
lastActivity time.Time
|
||||
mutex *sync.Mutex
|
||||
timeout time.Duration
|
||||
checkTimeout bool
|
||||
}
|
||||
|
||||
// NewHealthCheck builds new HealthCheck object with given timeout
|
||||
func NewHealthCheck(timeout time.Duration) *HealthCheck {
|
||||
return &HealthCheck{
|
||||
lastActivity: time.Now(),
|
||||
mutex: &sync.Mutex{},
|
||||
timeout: timeout,
|
||||
checkTimeout: false,
|
||||
}
|
||||
}
|
||||
|
||||
// StartMonitoring activates checks for autoscaler inactivity
|
||||
func (hc *HealthCheck) StartMonitoring() {
|
||||
hc.mutex.Lock()
|
||||
defer hc.mutex.Unlock()
|
||||
hc.checkTimeout = true
|
||||
now := time.Now()
|
||||
if now.After(hc.lastActivity) {
|
||||
hc.lastActivity = now
|
||||
}
|
||||
}
|
||||
|
||||
// ServeHTTP implements http.Handler interface to provide a health-check endpoint
|
||||
func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
hc.mutex.Lock()
|
||||
lastActivity := hc.lastActivity
|
||||
timedOut := hc.checkTimeout && time.Now().After(lastActivity.Add(hc.timeout))
|
||||
hc.mutex.Unlock()
|
||||
|
||||
if timedOut {
|
||||
w.WriteHeader(500)
|
||||
w.Write([]byte(fmt.Sprintf("Error: last activity more than %v ago", time.Now().Sub(lastActivity).String())))
|
||||
} else {
|
||||
w.WriteHeader(200)
|
||||
w.Write([]byte("OK"))
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateLastActivity updates last time of activity
|
||||
func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time) {
|
||||
hc.mutex.Lock()
|
||||
defer hc.mutex.Unlock()
|
||||
if timestamp.After(hc.lastActivity) {
|
||||
hc.lastActivity = timestamp
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func getTestResponse(start time.Time, timeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck := NewHealthCheck(timeout)
|
||||
if checkMonitoring {
|
||||
healthCheck.StartMonitoring()
|
||||
}
|
||||
healthCheck.lastActivity = start
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
return w
|
||||
}
|
||||
|
||||
func TestOkServeHTTP(t *testing.T) {
|
||||
w := getTestResponse(time.Now(), time.Second, true)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestFailServeHTTP(t *testing.T) {
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, true)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
}
|
||||
|
||||
func TestMonitoringOffAfterTimeout(t *testing.T) {
|
||||
w := getTestResponse(time.Now(), time.Second, false)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestMonitoringOffBeforeTimeout(t *testing.T) {
|
||||
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, false)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
||||
|
||||
func TestUpdateLastActivity(t *testing.T) {
|
||||
timeout := time.Second
|
||||
start := time.Now().Add(timeout * -2)
|
||||
|
||||
req := httptest.NewRequest("GET", "/health-check", nil)
|
||||
w := httptest.NewRecorder()
|
||||
healthCheck := NewHealthCheck(timeout)
|
||||
healthCheck.StartMonitoring()
|
||||
healthCheck.lastActivity = start
|
||||
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 500, w.Code)
|
||||
|
||||
w = httptest.NewRecorder()
|
||||
healthCheck.UpdateLastActivity(time.Now())
|
||||
healthCheck.ServeHTTP(w, req)
|
||||
assert.Equal(t, 200, w.Code)
|
||||
}
|
Loading…
Reference in New Issue