notebooks/components/crud-web-apps/jupyter/backend/apps/common/status.py

100 lines
3.5 KiB
Python

import datetime as dt
from kubeflow.kubeflow.crud_backend import api, status
# Value of an event's `type` attribute that find_error_event treats as
# describing why a Notebook could not start.
EVENT_TYPE_WARNING = "Warning"
# Annotation key whose presence in a Notebook's metadata.annotations makes
# process_status report the Notebook as stopped (or still stopping).
STOP_ANNOTATION = "kubeflow-resource-stopped"
def process_status(notebook):
    """
    Derive the UI status of a Notebook from its manifest and its events.

    The checks run in priority order and the first match wins:

    1. Stop annotation present -> STOPPED when no replicas are ready,
       otherwise TERMINATING (the Pod is still going away).
    2. ``deletionTimestamp`` present in the metadata -> TERMINATING.
    3. Exactly one ready replica -> READY.
    4. The reported container state has a ``waiting`` entry -> WAITING,
       with that entry's reason as the message.
    5. Otherwise -> WAITING, with the message of the latest Warning event
       if there is one, else a generic "Scheduling the Pod" message.

    :param notebook: Notebook manifest as a dict-like object.
    :return: the value built by ``status.create_status(phase, message)``.
    """
    nb_status = notebook.get("status", {})
    ready_replicas = nb_status.get("readyReplicas", 0)
    nb_metadata = notebook.get("metadata", {})

    # A stopped Notebook keeps the annotation; it only counts as fully
    # stopped once no replica is ready anymore.
    if STOP_ANNOTATION in nb_metadata.get("annotations", {}):
        if ready_replicas != 0:
            return status.create_status(
                status.STATUS_PHASE.TERMINATING, "Notebook Server is stopping."
            )
        return status.create_status(
            status.STATUS_PHASE.STOPPED,
            "No Pods are currently running for this Notebook Server.",
        )

    # A Notebook that is being deleted is reported as terminating
    if "deletionTimestamp" in nb_metadata:
        return status.create_status(
            status.STATUS_PHASE.TERMINATING, "Deleting this notebook server"
        )

    if ready_replicas == 1:
        return status.create_status(status.STATUS_PHASE.READY, "Running")

    # The container state (when reported) explains what the Pod waits for
    container_state = nb_status.get("containerState", "")
    if "waiting" in container_state:
        return status.create_status(
            status.STATUS_PHASE.WAITING, container_state["waiting"]["reason"]
        )

    # No container state to go by: look at the Notebook's events for
    # details on why it is not starting (see find_error_event)
    events = get_notebook_events(notebook)
    event_phase, event_message = find_error_event(events)
    if event_phase is not None:
        return status.create_status(event_phase, event_message)

    return status.create_status(
        status.STATUS_PHASE.WAITING, "Scheduling the Pod"
    )
def get_notebook_events(notebook):
    """
    Return the events of a Notebook that are not older than the Notebook.

    A user can delete a notebook server and create a new one with the same
    name, so events stamped before this Notebook's creationTimestamp are
    dropped.

    :param notebook: Notebook manifest; must contain metadata.name,
        metadata.namespace and metadata.creationTimestamp
        (formatted as ``%Y-%m-%dT%H:%M:%SZ``).
    :return: a lazy ``filter`` iterator over the matching events; it can
        only be consumed once.
    """
    nb_metadata = notebook["metadata"]
    name, namespace = nb_metadata["name"], nb_metadata["namespace"]
    created_at = dt.datetime.strptime(
        nb_metadata["creationTimestamp"], "%Y-%m-%dT%H:%M:%SZ"
    )

    all_events = api.list_notebook_events(name, namespace).items

    def _belongs_to_this_notebook(event):
        # Keep only events created at or after this Notebook's creation
        return event_timestamp(event) >= created_at

    return filter(_belongs_to_this_notebook, all_events)
def find_error_event(notebook_events):
    """
    Pick the most recent Warning event and turn it into (status, reason).

    Warning events surface why the resource could not be created. For a
    Notebook this can be, for example:

    EVENT_TYPE  EVENT_REASON      DESCRIPTION
    Warning     FailedCreate      pods "x" is forbidden: error looking up
                                  service account ...
                                  (originated in statefulset)
    Warning     FailedScheduling  0/1 nodes are available: 1 Insufficient
                                  cpu (originated in pod)

    :param notebook_events: iterable of event objects exposing ``type``,
        ``message`` and a creation timestamp readable by event_timestamp.
    :return: ``(status.STATUS_PHASE.WAITING, message)`` of the latest
        Warning event, or ``(None, None)`` when there is none.
    """
    newest_first = list(notebook_events)
    newest_first.sort(key=event_timestamp, reverse=True)

    latest_warning = next(
        (ev for ev in newest_first if ev.type == EVENT_TYPE_WARNING), None
    )
    if latest_warning is None:
        return None, None

    return status.STATUS_PHASE.WAITING, latest_warning.message
def event_timestamp(event):
    """
    Return the creation time of an event as a naive datetime in UTC.

    The result is compared against (and sorted alongside) naive datetimes
    parsed from ``...Z`` timestamps, so an aware timestamp is converted to
    UTC before its tzinfo is dropped. Dropping tzinfo alone would keep the
    local wall-clock time and skew the comparison for non-UTC offsets.

    :param event: object exposing ``metadata.creation_timestamp`` as a
        ``datetime``.
    :return: naive ``datetime``; a naive input is returned unchanged
        (it is assumed to already be in UTC).
    """
    created = event.metadata.creation_timestamp
    if created.utcoffset() is None:
        # Naive (or offset-less) timestamp: nothing to convert
        return created.replace(tzinfo=None)
    return created.astimezone(dt.timezone.utc).replace(tzinfo=None)