203 lines
5.3 KiB
Go
203 lines
5.3 KiB
Go
package culler
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"time"
|
|
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
logf "sigs.k8s.io/controller-runtime/pkg/runtime/log"
|
|
)
|
|
|
|
var log = logf.Log.WithName("culler")
|
|
var client = &http.Client{
|
|
Timeout: time.Second * 10,
|
|
}
|
|
|
|
// Fallback values for the culler's configuration. A constant named
// DEFAULT_{ENV_VAR} is used whenever the environment variable {ENV_VAR} is
// not present. All time values are expressed in minutes.
const (
	// DEFAULT_IDLE_TIME is the fallback for IDLE_TIME: 1440 minutes, one day.
	DEFAULT_IDLE_TIME = "1440"

	// DEFAULT_CULLING_CHECK_PERIOD is the fallback for CULLING_CHECK_PERIOD.
	DEFAULT_CULLING_CHECK_PERIOD = "1"

	// DEFAULT_ENABLE_CULLING is the fallback for ENABLE_CULLING.
	DEFAULT_ENABLE_CULLING = "false"

	// DEFAULT_CLUSTER_DOMAIN is the fallback for CLUSTER_DOMAIN.
	DEFAULT_CLUSTER_DOMAIN = "cluster.local"
)
|
|
|
|
// STOP_ANNOTATION is the metadata annotation key that marks a Resource as
// stopped/culled.
//
// The controller adds this annotation to a Resource's metadata when the
// Resource should be stopped. Inside the reconcile loop the controller must
// check whether the annotation is present and, if so, apply the culling logic
// for that Resource. The annotation's value is a timestamp of when the
// Resource was stopped/culled.
//
// For Notebooks, the controller scales the replicas down to 0 while the
// annotation is set, and back to 1 when it is not.
const STOP_ANNOTATION = "kubeflow-resource-stopped"
|
|
|
|
// NotebookStatus holds the fields decoded from the JSON document returned by
// a Notebook server's /api/status endpoint.
type NotebookStatus struct {
	// Started is the "started" field of the response, kept as a raw string.
	Started string `json:"started"`

	// LastActivity is the "last_activity" field of the response. It is
	// parsed as an RFC 3339 timestamp when deciding whether the Notebook is
	// idle.
	LastActivity string `json:"last_activity"`

	// Connections is the "connections" field of the response.
	Connections int `json:"connections"`

	// Kernels is the "kernels" field of the response.
	Kernels int `json:"kernels"`
}
|
|
|
|
// Some Utility Functions
|
|
// getEnvDefault returns the value of the environment variable named by
// variable. When that variable is unset, or set to the empty string,
// defaultVal is returned instead.
func getEnvDefault(variable string, defaultVal string) string {
	if value := os.Getenv(variable); value != "" {
		return value
	}
	return defaultVal
}
|
|
|
|
// Time / Frequency Utility functions
|
|
// createTimestamp returns the current time formatted as an RFC 3339 string.
func createTimestamp() string {
	return time.Now().Format(time.RFC3339)
}
|
|
|
|
func GetRequeueTime() time.Duration {
|
|
// The frequency in which we check if the Pod needs culling
|
|
// Uses ENV var: CULLING_CHECK_PERIOD
|
|
cullingPeriod := getEnvDefault(
|
|
"CULLING_CHECK_PERIOD", DEFAULT_CULLING_CHECK_PERIOD)
|
|
realCullingPeriod, err := strconv.Atoi(cullingPeriod)
|
|
if err != nil {
|
|
log.Info(fmt.Sprintf(
|
|
"Culling Period should be Int. Got '%s'. Using default value.",
|
|
cullingPeriod))
|
|
realCullingPeriod, _ = strconv.Atoi(DEFAULT_CULLING_CHECK_PERIOD)
|
|
}
|
|
|
|
return time.Duration(realCullingPeriod) * time.Minute
|
|
}
|
|
|
|
func getMaxIdleTime() time.Duration {
|
|
idleTime := getEnvDefault("IDLE_TIME", DEFAULT_IDLE_TIME)
|
|
realIdleTime, err := strconv.Atoi(idleTime)
|
|
if err != nil {
|
|
log.Info(fmt.Sprintf(
|
|
"IDLE_TIME should be Int. Got %s instead. Using default value.",
|
|
idleTime))
|
|
realIdleTime, _ = strconv.Atoi(DEFAULT_IDLE_TIME)
|
|
}
|
|
|
|
return time.Minute * time.Duration(realIdleTime)
|
|
}
|
|
|
|
// Stop Annotation handling functions
|
|
func SetStopAnnotation(meta *metav1.ObjectMeta) {
|
|
if meta == nil {
|
|
log.Info("Error: Metadata is Nil. Can't set Annotations")
|
|
return
|
|
}
|
|
|
|
if meta.GetAnnotations() != nil {
|
|
meta.Annotations[STOP_ANNOTATION] = createTimestamp()
|
|
} else {
|
|
meta.SetAnnotations(map[string]string{
|
|
STOP_ANNOTATION: createTimestamp(),
|
|
})
|
|
}
|
|
}
|
|
|
|
// TODO: add remove notebooks count metric
|
|
func RemoveStopAnnotation(meta *metav1.ObjectMeta) {
|
|
if meta == nil {
|
|
log.Info("Error: Metadata is Nil. Can't remove Annotations")
|
|
return
|
|
}
|
|
|
|
if meta.GetAnnotations() == nil {
|
|
return
|
|
}
|
|
|
|
if _, ok := meta.GetAnnotations()[STOP_ANNOTATION]; ok {
|
|
delete(meta.GetAnnotations(), STOP_ANNOTATION)
|
|
}
|
|
}
|
|
|
|
func StopAnnotationIsSet(meta metav1.ObjectMeta) bool {
|
|
if meta.GetAnnotations() == nil {
|
|
return false
|
|
}
|
|
|
|
if _, ok := meta.GetAnnotations()[STOP_ANNOTATION]; ok {
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
// Culling Logic
|
|
func getNotebookApiStatus(nm, ns string) *NotebookStatus {
|
|
// Get the Notebook Status from the Server's /api/status endpoint
|
|
domain := getEnvDefault("CLUSTER_DOMAIN", DEFAULT_CLUSTER_DOMAIN)
|
|
url := fmt.Sprintf(
|
|
"http://%s.%s.svc.%s/notebook/%s/%s/api/status",
|
|
nm, ns, domain, ns, nm)
|
|
|
|
resp, err := client.Get(url)
|
|
if err != nil {
|
|
log.Info(fmt.Sprintf("Error talking to %s", url), "error", err)
|
|
return nil
|
|
}
|
|
|
|
// Decode the body
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
log.Info(fmt.Sprintf(
|
|
"Warning: GET to %s: %d", url, resp.StatusCode))
|
|
return nil
|
|
}
|
|
|
|
status := new(NotebookStatus)
|
|
err = json.NewDecoder(resp.Body).Decode(status)
|
|
if err != nil {
|
|
log.Info(fmt.Sprintf(
|
|
"Error parsing the JSON response for Notebook %s/%s", nm, ns),
|
|
"error", err)
|
|
return nil
|
|
}
|
|
|
|
return status
|
|
}
|
|
|
|
func notebookIsIdle(nm, ns string, status *NotebookStatus) bool {
|
|
// Being idle means that the Notebook can be culled
|
|
if status == nil {
|
|
return false
|
|
}
|
|
|
|
lastActivity, err := time.Parse(time.RFC3339, status.LastActivity)
|
|
if err != nil {
|
|
log.Info(fmt.Sprintf("Error parsing time for Notebook %s/%s", nm, ns),
|
|
"error", err)
|
|
return false
|
|
}
|
|
|
|
timeCap := lastActivity.Add(getMaxIdleTime())
|
|
if time.Now().After(timeCap) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func NotebookNeedsCulling(nbMeta metav1.ObjectMeta) bool {
|
|
if getEnvDefault("ENABLE_CULLING", DEFAULT_ENABLE_CULLING) != "true" {
|
|
log.Info("Culling of idle Pods is Disabled. To enable it set the " +
|
|
"ENV Var 'ENABLE_CULLING=true'")
|
|
return false
|
|
}
|
|
|
|
nm, ns := nbMeta.GetName(), nbMeta.GetNamespace()
|
|
if StopAnnotationIsSet(nbMeta) {
|
|
log.Info(fmt.Sprintf("Notebook %s/%s is already stopping", ns, nm))
|
|
return false
|
|
}
|
|
|
|
notebookStatus := getNotebookApiStatus(nm, ns)
|
|
return notebookIsIdle(nm, ns, notebookStatus)
|
|
}
|