notebooks: Fix endless restarts (kubeflow/kubeflow#6341)
* notebooks: Update notebook if timestamp changed We don't want to be updating the spec of the notebook if the timestamp hasn't changed, since this will lead to constant updates and reconciliation loops. Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com> * notebooks: Use a deep-copy of the notebook spec The controller should use a deep-copy of the notebook spec when calculating the spec for the StatefulSet. If not then we could update the notebook object without wanting it, since the spec could have been changed when calculating the STS spec. Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com> * notebooks: Add prefix env var only if missing The controller should be setting OR updating the NB_PREFIX env var. Previously it would always blindly append it to the spec, which could result in double entries for the same env var. Signed-off-by: Kimonas Sotirchos <kimwnasptd@arrikto.com>
This commit is contained in:
parent
9510a2b913
commit
1d24b75f57
|
|
@ -50,6 +50,8 @@ const DefaultServingPort = 80
|
|||
const AnnotationRewriteURI = "notebooks.kubeflow.org/http-rewrite-uri"
|
||||
const AnnotationHeadersRequestSet = "notebooks.kubeflow.org/http-headers-request-set"
|
||||
|
||||
const PrefixEnvVar = "NB_PREFIX"
|
||||
|
||||
// The default fsGroup of PodSecurityContext.
|
||||
// https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#podsecuritycontext-v1-core
|
||||
const DefaultFSGroup = int64(100)
|
||||
|
|
@ -286,11 +288,11 @@ func (r *NotebookReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
|
|||
// Pod is found
|
||||
// Check if the Notebook needs to be stopped
|
||||
// Update the LAST_ACTIVITY_ANNOTATION
|
||||
culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta)
|
||||
|
||||
err = r.Update(ctx, instance)
|
||||
if err != nil {
|
||||
return ctrl.Result{}, err
|
||||
if culler.UpdateNotebookLastActivityAnnotation(&instance.ObjectMeta) {
|
||||
err = r.Update(ctx, instance)
|
||||
if err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the Notebook needs to be stopped
|
||||
|
|
@ -341,6 +343,22 @@ func getNextCondition(cs corev1.ContainerState) v1beta1.NotebookCondition {
|
|||
return newCondition
|
||||
}
|
||||
|
||||
func setPrefixEnvVar(instance *v1beta1.Notebook, container *corev1.Container) {
|
||||
prefix := "/notebook/" + instance.Namespace + "/" + instance.Name
|
||||
|
||||
for _, envVar := range container.Env {
|
||||
if envVar.Name == PrefixEnvVar {
|
||||
envVar.Value = prefix
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
container.Env = append(container.Env, corev1.EnvVar{
|
||||
Name: PrefixEnvVar,
|
||||
Value: prefix,
|
||||
})
|
||||
}
|
||||
|
||||
func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
|
||||
replicas := int32(1)
|
||||
if culler.StopAnnotationIsSet(instance.ObjectMeta) {
|
||||
|
|
@ -364,7 +382,7 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
|
|||
"statefulset": instance.Name,
|
||||
"notebook-name": instance.Name,
|
||||
}},
|
||||
Spec: instance.Spec.Template.Spec,
|
||||
Spec: *instance.Spec.Template.Spec.DeepCopy(),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
@ -388,10 +406,8 @@ func generateStatefulSet(instance *v1beta1.Notebook) *appsv1.StatefulSet {
|
|||
},
|
||||
}
|
||||
}
|
||||
container.Env = append(container.Env, corev1.EnvVar{
|
||||
Name: "NB_PREFIX",
|
||||
Value: "/notebook/" + instance.Namespace + "/" + instance.Name,
|
||||
})
|
||||
|
||||
setPrefixEnvVar(instance, container)
|
||||
|
||||
// For some platforms (like OpenShift), adding fsGroup: 100 is troublesome.
|
||||
// This allows for those platforms to bypass the automatic addition of the fsGroup
|
||||
|
|
|
|||
|
|
@ -204,11 +204,11 @@ func allKernelsAreIdle(kernels []KernelStatus, log logr.Logger) bool {
|
|||
}
|
||||
|
||||
// Update LAST_ACTIVITY_ANNOTATION
|
||||
func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) {
|
||||
func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) bool {
|
||||
log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta))
|
||||
if meta == nil {
|
||||
log.Info("Metadata is Nil. Can't update Last Activity Annotation.")
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
log.Info("Updating the last-activity annotation.")
|
||||
|
|
@ -223,25 +223,25 @@ func UpdateNotebookLastActivityAnnotation(meta *metav1.ObjectMeta) {
|
|||
meta.SetAnnotations(map[string]string{})
|
||||
}
|
||||
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
|
||||
return
|
||||
return true
|
||||
}
|
||||
|
||||
log.Info("last-activity annotation exists. Checking /api/kernels")
|
||||
kernels := getNotebookApiKernels(nm, ns)
|
||||
if kernels == nil {
|
||||
log.Info("Could not GET the kernels status. Will not update last-activity.")
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
updateTimestampFromKernelsActivity(meta, kernels)
|
||||
return updateTimestampFromKernelsActivity(meta, kernels)
|
||||
}
|
||||
|
||||
func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) {
|
||||
func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []KernelStatus) bool {
|
||||
log := log.WithValues("notebook", getNamespacedNameFromMeta(*meta))
|
||||
|
||||
if len(kernels) == 0 {
|
||||
log.Info("Notebook has no kernels. Will not update last-activity")
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
if !allKernelsAreIdle(kernels, log) {
|
||||
|
|
@ -251,7 +251,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
|
|||
log.Info(fmt.Sprintf("Found a busy kernel. Updating the last-activity to %s", t))
|
||||
|
||||
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
|
||||
return
|
||||
return true
|
||||
}
|
||||
|
||||
// Checking for the most recent kernel last_activity. The LAST_ACTIVITY_ANNOTATION
|
||||
|
|
@ -259,14 +259,14 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
|
|||
recentTime, err := time.Parse(time.RFC3339, kernels[0].LastActivity)
|
||||
if err != nil {
|
||||
log.Error(err, "Error parsing the last-activity from the /api/kernels")
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
for i := 1; i < len(kernels); i++ {
|
||||
kernelLastActivity, err := time.Parse(time.RFC3339, kernels[i].LastActivity)
|
||||
if err != nil {
|
||||
log.Error(err, "Error parsing the last-activity from the /api/kernels")
|
||||
return
|
||||
return false
|
||||
}
|
||||
if kernelLastActivity.After(recentTime) {
|
||||
recentTime = kernelLastActivity
|
||||
|
|
@ -276,6 +276,7 @@ func updateTimestampFromKernelsActivity(meta *metav1.ObjectMeta, kernels []Kerne
|
|||
|
||||
meta.Annotations[LAST_ACTIVITY_ANNOTATION] = t
|
||||
log.Info(fmt.Sprintf("Successfully updated last-activity from latest kernel action, %s", t))
|
||||
return true
|
||||
}
|
||||
|
||||
func notebookIsIdle(meta metav1.ObjectMeta) bool {
|
||||
|
|
|
|||
Loading…
Reference in New Issue