notebooks: Fix endless restarts (kubeflow/kubeflow#6341)

* notebooks: Update notebook if timestamp changed

We don't want to be updating the spec of the notebook if the timestamp
hasn't changed, since this will lead to constant updates and
reconciliation loops.

Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com>

* notebooks: Use a deep-copy of the notebook spec

The controller should use a deep-copy of the notebook spec when
calculating the spec for the StatefulSet. If not then we could
update the notebook object without wanting it, since the spec could have
been changed when calculating the STS spec.

Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com>

* notebooks: Add prefix env var only if missing

The controller should be setting OR updating the NB_PREFIX env var.
Previously it would always blindly append it to the spec, which could
result in double entries for the same env var.

Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com>
This commit is contained in:
Kimonas Sotirchos 2022-02-09 19:32:07 +02:00 committed by GitHub
parent 9510a2b913
commit 1d24b75f57
2 changed files with 37 additions and 20 deletions

View File

@ -50,6 +50,8 @@ const DefaultServingPort = 80
const AnnotationRewriteURI = "notebooks.kubeflow.org/http-rewrite-uri"
const AnnotationHeadersRequestSet = "notebooks.kubeflow.org/http-headers-request-set"
const PrefixEnvVar = "NB_PREFIX"
// The default fsGroup of PodSecurityContext.
// https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#podsecuritycontext-v1-core
const DefaultFSGroup = int64(100)
@ -286,11 +288,11 @@ func (r *NotebookReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
// Pod is found
// Check if the Notebook needs to be stopped
// Update the LAST_ACTIVITY_ANNOTATION
culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta)
err = r.Update(ctx, instance)
if err != nil {
return ctrl.Result{}, err
if culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta) {
err = r.Update(ctx, instance)
if err != nil {
return ctrl.Result{}, err
}
}
// Check if the Notebook needs to be stopped
@ -341,6 +343,22 @@ func getNextCondition(cs corev1.ContainerState) v1beta1.NotebookCondition {
return newCondition
}
func setPrefixEnvVar(instance *v1beta1.Notebook, container *corev1.Container) {
prefix := "/notebook/" + instance.Namespace + "/" + instance.Name
for _, envVar := range container.Env {
if envVar.Name == PrefixEnvVar {
envVar.Value = prefix
return
}
}
container.Env = append(container.Env, corev1.EnvVar{
Name: PrefixEnvVar,
Value: prefix,
})
}
func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
replicas := int32(1)
if culler.StopAnnotationIsSet(instance.ObjectMeta) {
@ -364,7 +382,7 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
"statefulset": instance.Name,
"notebook-name": instance.Name,
}},
Spec: instance.Spec.Template.Spec,
Spec: *instance.Spec.Template.Spec.DeepCopy(),
},
},
}
@ -388,10 +406,8 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
},
}
}
container.Env = append(container.Env, corev1.EnvVar{
Name: "NB_PREFIX",
Value: "/notebook/" + instance.Namespace + "/" + instance.Name,
})
setPrefixEnvVar(instance, container)
// For some platforms (like OpenShift), adding fsGroup: 100 is troublesome.
// This allows for those platforms to bypass the automatic addition of the fsGroup

View File

@ -204,11 +204,11 @@ func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool {
}
// Update LAST_ACTIVITY_ANNOTATION
func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) {
func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) bool {
log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta))
if meta == nil {
log.Info("Metadata is Nil. Can't update Last Activity Annotation.")
return
return false
}
log.Info("Updating the last-activity annotation.")
@ -223,25 +223,25 @@ func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) {
meta.SetAnnotations(map[string]string{})
}
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
return
return true
}
log.Info("last-activity annotation exists. Checking /api/kernels")
kernels := getNotebookApiKernels(nm, ns)
if kernels == nil {
log.Info("Could not GET the kernels status. Will not update last-activity.")
return
return false
}
updateTimestampFromKernelsActivity(meta, kernels)
return updateTimestampFromKernelsActivity(meta, kernels)
}
func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) {
func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) bool {
log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta))
if len(kernels) == 0 {
log.Info("Notebook has no kernels. Will not update last-activity")
return
return false
}
if !allKernelsAreIdle(kernels, log) {
@ -251,7 +251,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
log.Info(fmt.Sprintf("Found a busy kernel. Updating the last-activity to %s", t))
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
return
return true
}
// Checking for the most recent kernel last_activity. The LAST_ACTIVITY_ANNOTATION
@ -259,14 +259,14 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
recentTime, err := time.Parse(time.RFC3339, kernels[0].LastActivity)
if err != nil {
log.Error(err, "Error parsing the last-activity from the /api/kernels")
return
return false
}
for i := 1; i < len(kernels); i++ {
kernelLastActivity, err := time.Parse(time.RFC3339, kernels[i].LastActivity)
if err != nil {
log.Error(err, "Error parsing the last-activity from the /api/kernels")
return
return false
}
if kernelLastActivity.After(recentTime) {
recentTime = kernelLastActivity
@ -276,6 +276,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t))
return true
}
func notebookIsIdle(meta metav1.ObjectMeta) bool {