From 1d24b75f57722ffe8df01a0067d5453521adb243 Mon Sep 17 00:00:00 2001 From: Kimonas Sotirchos Date: Wed, 9 Feb 2022 19:32:07 +0200 Subject: [PATCH] notebooks: Fix endless restarts (kubeflow/kubeflow#6341) * notebooks: Update notebook if timestamp changed We don't want to be updating the spec of the notebook if the timestamp hasn't changed, since this will lead to constant updates and reconciliation loops. Signed-off-by: Kimonas Sotirchos * notebooks: Use a deep-copy of the notebook spec The controller should use a deep-copy of the notebook spec when calculating the spec for the StatefulSet. If not then we could update the notebook object without wanting it, since the spec could have been changed when calculating the STS spec. Signed-off-by: Kimonas Sotirchos * notebooks: Add prefix env var only if missing The controller should be setting OR updating the NB_PREFIX env var. Previously it would always blindly append it to the spec, which could result in double entries for the same env var. Signed-off-by: Kimonas Sotirchos --- .../controllers/notebook_controller.go | 36 +++++++++++++------ .../notebook-controller/pkg/culler/culler.go | 21 +++++------ 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/components/notebook-controller/controllers/notebook_controller.go b/components/notebook-controller/controllers/notebook_controller.go index b133fa29..71815226 100644 --- a/components/notebook-controller/controllers/notebook_controller.go +++ b/components/notebook-controller/controllers/notebook_controller.go @@ -50,6 +50,8 @@ const DefaultServingPort = 80 const AnnotationRewriteURI = "notebooks.kubeflow.org/http-rewrite-uri" const AnnotationHeadersRequestSet = "notebooks.kubeflow.org/http-headers-request-set" +const PrefixEnvVar = "NB_PREFIX" + // The default fsGroup of PodSecurityContext. // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#podsecuritycontext-v1-core const DefaultFSGroup = int64(100) @@ -286,11 +288,11 @@ func (r *NotebookReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { // Pod is found // Check if the Notebook needs to be stopped // Update the LAST_ACTIVITY_ANNOTATION - culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta) - - err = r.Update(ctx, instance) - if err != nil { - return ctrl.Result{}, err + if culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta) { + err = r.Update(ctx, instance) + if err != nil { + return ctrl.Result{}, err + } } // Check if the Notebook needs to be stopped @@ -341,6 +343,22 @@ func getNextCondition(cs corev1.ContainerState) v1beta1.NotebookCondition { return newCondition } +func setPrefixEnvVar(instance *v1beta1.Notebook, container *corev1.Container) { + prefix := "/notebook/" + instance.Namespace + "/" + instance.Name + + for _, envVar := range container.Env { + if envVar.Name == PrefixEnvVar { + envVar.Value = prefix + return + } + } + + container.Env = append(container.Env, corev1.EnvVar{ + Name: PrefixEnvVar, + Value: prefix, + }) +} + func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet { replicas := int32(1) if culler.StopAnnotationIsSet(instance.ObjectMeta) { @@ -364,7 +382,7 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet { "statefulset": instance.Name, "notebook-name": instance.Name, }}, - Spec: instance.Spec.Template.Spec, + Spec: *instance.Spec.Template.Spec.DeepCopy(), }, }, } @@ -388,10 +406,8 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet { }, } } - container.Env = append(container.Env, corev1.EnvVar{ - Name: "NB_PREFIX", - Value: "/notebook/" + instance.Namespace + "/" + instance.Name, - }) + + setPrefixEnvVar(instance, container) // For some platforms (like OpenShift), adding fsGroup: 100 is troublesome. // This allows for those platforms to bypass the automatic addition of the fsGroup diff --git a/components/notebook-controller/pkg/culler/culler.go b/components/notebook-controller/pkg/culler/culler.go index 3fed9f48..a2feab2c 100644 --- a/components/notebook-controller/pkg/culler/culler.go +++ b/components/notebook-controller/pkg/culler/culler.go @@ -204,11 +204,11 @@ func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool { } // Update LAST_ACTIVITY_ANNOTATION -func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) { +func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) bool { log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta)) if meta == nil { log.Info("Metadata is Nil. Can't update Last Activity Annotation.") - return + return false } log.Info("Updating the last-activity annotation.") @@ -223,25 +223,25 @@ func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) { meta.SetAnnotations(map[string]string{}) } meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t - return + return true } log.Info("last-activity annotation exists. Checking /api/kernels") kernels := getNotebookApiKernels(nm, ns) if kernels == nil { log.Info("Could not GET the kernels status. Will not update last-activity.") - return + return false } - updateTimestampFromKernelsActivity(meta, kernels) + return updateTimestampFromKernelsActivity(meta, kernels) } -func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) { +func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) bool { log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta)) if len(kernels) == 0 { log.Info("Notebook has no kernels. Will not update last-activity") - return + return false } if !allKernelsAreIdle(kernels, log) { @@ -251,7 +251,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne log.Info(fmt.Sprintf("Found a busy kernel. Updating the last-activity to %s", t)) meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t - return + return true } // Checking for the most recent kernel last_activity. The LAST_ACTIVITY_ANNOTATION @@ -259,14 +259,14 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne recentTime, err := time.Parse(time.RFC3339, kernels[0].LastActivity) if err != nil { log.Error(err, "Error parsing the last-activity from the /api/kernels") - return + return false } for i := 1; i < len(kernels); i++ { kernelLastActivity, err := time.Parse(time.RFC3339, kernels[i].LastActivity) if err != nil { log.Error(err, "Error parsing the last-activity from the /api/kernels") - return + return false } if kernelLastActivity.After(recentTime) { recentTime = kernelLastActivity @@ -276,6 +276,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t)) + return true } func notebookIsIdle(meta metav1.ObjectMeta) bool {