// Source: notebooks/components/notebook-controller/pkg/culler/culler.go
// 203 lines, 5.3 KiB, Go

package culler
import (
"encoding/json"
"fmt"
"net/http"
"os"
"strconv"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
logf "sigs.k8s.io/controller-runtime/pkg/runtime/log"
)
// log is the package-level logger, obtained from logf.Log under the name
// "culler".
var log = logf.Log.WithName("culler")
// client is the package-level HTTP client. Every request made through it is
// bounded by a 10 second timeout.
var client = &http.Client{Timeout: 10 * time.Second}
// Fallback values for the culler's ENV vars. Each constant named
// 'DEFAULT_{ENV_Var}' is the value used when the respective ENV var is not
// present.
// Every time-related number is expressed in minutes.
const (
	DEFAULT_IDLE_TIME            = "1440" // One day
	DEFAULT_CULLING_CHECK_PERIOD = "1"
	DEFAULT_ENABLE_CULLING       = "false"
	DEFAULT_CLUSTER_DOMAIN       = "cluster.local"
)
// STOP_ANNOTATION is the annotation key that marks a Resource as
// stopped/culled.
//
// When a Resource should be stopped/culled, the controller should add this
// annotation to the Resource's Metadata. Then, inside the reconcile loop, the
// controller must check whether this annotation is set and apply the
// respective culling logic for that Resource. The value of the annotation is a
// timestamp of when the Resource was stopped/culled.
//
// In the case of Notebooks, the controller reduces the replicas to 0 if this
// annotation is set. If it is not set, it makes the replicas 1.
const STOP_ANNOTATION = "kubeflow-resource-stopped"
// NotebookStatus holds the fields decoded from the JSON body returned by a
// Notebook server's /api/status endpoint.
type NotebookStatus struct {
	// Started is the "started" field of the response.
	Started string `json:"started"`
	// LastActivity is the "last_activity" field of the response. It is parsed
	// as an RFC3339 timestamp when deciding whether the Notebook is idle.
	LastActivity string `json:"last_activity"`
	// Connections is the "connections" field of the response.
	Connections int `json:"connections"`
	// Kernels is the "kernels" field of the response.
	Kernels int `json:"kernels"`
}
// Some Utility Functions
// getEnvDefault returns the value of the environment variable named by
// variable, or defaultVal when that variable is unset or empty.
func getEnvDefault(variable string, defaultVal string) string {
	if value := os.Getenv(variable); value != "" {
		return value
	}
	return defaultVal
}
// Time / Frequency Utility functions
// createTimestamp returns the current time formatted as an RFC3339 string.
func createTimestamp() string {
	return time.Now().Format(time.RFC3339)
}
// GetRequeueTime returns the frequency with which we check whether the Pod
// needs culling.
//
// The period is read, in minutes, from the ENV var CULLING_CHECK_PERIOD. When
// that variable is unset or empty, is not an integer, or is negative,
// DEFAULT_CULLING_CHECK_PERIOD is used instead. A value of zero is passed
// through unchanged.
func GetRequeueTime() time.Duration {
	cullingPeriod := getEnvDefault(
		"CULLING_CHECK_PERIOD", DEFAULT_CULLING_CHECK_PERIOD)

	realCullingPeriod, err := strconv.Atoi(cullingPeriod)
	switch {
	case err != nil:
		log.Info(fmt.Sprintf(
			"Culling Period should be Int. Got '%s'. Using default value.",
			cullingPeriod))
	case realCullingPeriod < 0:
		// A negative number parses fine but would produce a negative
		// duration, which is not a meaningful check period.
		log.Info(fmt.Sprintf(
			"Culling Period should not be negative. Got '%s'. Using default value.",
			cullingPeriod))
	default:
		return time.Duration(realCullingPeriod) * time.Minute
	}

	// DEFAULT_CULLING_CHECK_PERIOD is a constant integer literal, so the
	// conversion error can be safely ignored here.
	realCullingPeriod, _ = strconv.Atoi(DEFAULT_CULLING_CHECK_PERIOD)
	return time.Duration(realCullingPeriod) * time.Minute
}
// getMaxIdleTime returns how long a Notebook may stay inactive before it is
// considered idle.
//
// The value is read, in minutes, from the ENV var IDLE_TIME. When that
// variable is unset or empty, is not an integer, or is negative,
// DEFAULT_IDLE_TIME is used instead. A value of zero is passed through
// unchanged.
func getMaxIdleTime() time.Duration {
	idleTime := getEnvDefault("IDLE_TIME", DEFAULT_IDLE_TIME)

	realIdleTime, err := strconv.Atoi(idleTime)
	switch {
	case err != nil:
		log.Info(fmt.Sprintf(
			"IDLE_TIME should be Int. Got %s instead. Using default value.",
			idleTime))
	case realIdleTime < 0:
		// A negative idle time would place the idle cut-off before the
		// Notebook's last activity, so every Notebook would be reported as
		// idle. Treat it as a misconfiguration.
		log.Info(fmt.Sprintf(
			"IDLE_TIME should not be negative. Got %s instead. Using default value.",
			idleTime))
	default:
		return time.Minute * time.Duration(realIdleTime)
	}

	// DEFAULT_IDLE_TIME is a constant integer literal, so the conversion
	// error can be safely ignored here.
	realIdleTime, _ = strconv.Atoi(DEFAULT_IDLE_TIME)
	return time.Minute * time.Duration(realIdleTime)
}
// Stop Annotation handling functions
// SetStopAnnotation writes STOP_ANNOTATION into the annotations of meta, using
// the current timestamp as the value. When meta has no annotations yet, a new
// annotation map holding only this entry is set. A nil meta is logged and
// otherwise ignored.
func SetStopAnnotation(meta *metav1.ObjectMeta) {
	if meta == nil {
		log.Info("Error: Metadata is Nil. Can't set Annotations")
		return
	}

	stoppedAt := createTimestamp()
	if meta.GetAnnotations() == nil {
		meta.SetAnnotations(map[string]string{STOP_ANNOTATION: stoppedAt})
		return
	}
	meta.Annotations[STOP_ANNOTATION] = stoppedAt
}
// RemoveStopAnnotation deletes STOP_ANNOTATION from the annotations of meta.
// It does nothing when meta has no annotations or the key is absent. A nil
// meta is logged and otherwise ignored.
//
// TODO: add remove notebooks count metric
func RemoveStopAnnotation(meta *metav1.ObjectMeta) {
	if meta == nil {
		log.Info("Error: Metadata is Nil. Can't remove Annotations")
		return
	}

	// The built-in delete is a no-op for a nil map and for a missing key, so
	// no presence checks are needed beforehand.
	delete(meta.GetAnnotations(), STOP_ANNOTATION)
}
// StopAnnotationIsSet reports whether the annotations of meta contain the
// STOP_ANNOTATION key, regardless of the value stored under it.
func StopAnnotationIsSet(meta metav1.ObjectMeta) bool {
	// Looking up a key in a nil map is safe and simply reports "not found".
	_, found := meta.GetAnnotations()[STOP_ANNOTATION]
	return found
}
// Culling Logic
// getNotebookApiStatus gets the Notebook Status from the Server's /api/status
// endpoint for the Notebook named nm in namespace ns.
//
// The request goes to
// http://{nm}.{ns}.svc.{domain}/notebook/{ns}/{nm}/api/status, where domain is
// read from the ENV var CLUSTER_DOMAIN and falls back to
// DEFAULT_CLUSTER_DOMAIN.
//
// It returns nil when the request fails, when the response status is not
// 200 OK, or when the body cannot be decoded as JSON. Each of these cases is
// logged.
func getNotebookApiStatus(nm, ns string) *NotebookStatus {
	domain := getEnvDefault("CLUSTER_DOMAIN", DEFAULT_CLUSTER_DOMAIN)
	url := fmt.Sprintf(
		"http://%s.%s.svc.%s/notebook/%s/%s/api/status",
		nm, ns, domain, ns, nm)

	resp, err := client.Get(url)
	if err != nil {
		log.Info(fmt.Sprintf("Error talking to %s", url), "error", err)
		return nil
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Info(fmt.Sprintf(
			"Warning: GET to %s: %d", url, resp.StatusCode))
		return nil
	}

	// Decode the body
	status := new(NotebookStatus)
	if err := json.NewDecoder(resp.Body).Decode(status); err != nil {
		// Identify the Notebook as namespace/name, the same order used by the
		// other Notebook log messages in this package.
		log.Info(fmt.Sprintf(
			"Error parsing the JSON response for Notebook %s/%s", ns, nm),
			"error", err)
		return nil
	}
	return status
}
// notebookIsIdle reports whether the Notebook named nm in namespace ns has
// been inactive for longer than the maximum idle time. Being idle means that
// the Notebook can be culled.
//
// It returns false when status is nil or when status.LastActivity cannot be
// parsed as an RFC3339 timestamp; the latter case is logged.
func notebookIsIdle(nm, ns string, status *NotebookStatus) bool {
	if status == nil {
		return false
	}

	lastActivity, err := time.Parse(time.RFC3339, status.LastActivity)
	if err != nil {
		// Identify the Notebook as namespace/name, the same order used by the
		// other Notebook log messages in this package.
		log.Info(fmt.Sprintf("Error parsing time for Notebook %s/%s", ns, nm),
			"error", err)
		return false
	}

	// The Notebook is idle once the current time is past the last activity
	// plus the allowed idle time.
	timeCap := lastActivity.Add(getMaxIdleTime())
	return time.Now().After(timeCap)
}
// NotebookNeedsCulling reports whether the Notebook described by nbMeta should
// be culled.
//
// It returns false when the ENV var ENABLE_CULLING is not "true" or when the
// Notebook already carries the stop annotation. Otherwise it fetches the
// Notebook's status from its API and returns whether the Notebook is idle.
func NotebookNeedsCulling(nbMeta metav1.ObjectMeta) bool {
	enabled := getEnvDefault("ENABLE_CULLING", DEFAULT_ENABLE_CULLING)
	if enabled != "true" {
		log.Info("Culling of idle Pods is Disabled. To enable it set the " +
			"ENV Var 'ENABLE_CULLING=true'")
		return false
	}

	name := nbMeta.GetName()
	namespace := nbMeta.GetNamespace()
	if StopAnnotationIsSet(nbMeta) {
		log.Info(fmt.Sprintf("Notebook %s/%s is already stopping", namespace, name))
		return false
	}

	return notebookIsIdle(name, namespace, getNotebookApiStatus(name, namespace))
}