200 lines
7.4 KiB
Python
200 lines
7.4 KiB
Python
import datetime as dt
|
|
|
|
from kubeflow.kubeflow.crud_backend import api, status
|
|
|
|
EVENT_TYPE_WARNING = "Warning"
|
|
STOP_ANNOTATION = "kubeflow-resource-stopped"
|
|
|
|
|
|
def process_status(notebook):
|
|
"""
|
|
Return status and reason. Status may be:
|
|
[ready|waiting|warning|terminating|stopped]
|
|
"""
|
|
# In case the Notebook has no status
|
|
status_phase, status_message = get_empty_status(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# In case the Notebook is being stopped
|
|
status_phase, status_message = get_stopped_status(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# In case the Notebook is being deleted
|
|
status_phase, status_message = get_deleted_status(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# In case the Notebook is ready
|
|
status_phase, status_message = check_ready_nb(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# Extract information about the status from the containerState of the
|
|
# Notebook's status
|
|
status_phase, status_message = get_status_from_container_state(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# Extract information about the status from the conditions of the
|
|
# Notebook's status
|
|
status_phase, status_message = get_status_from_conditions(notebook)
|
|
if status_phase is not None:
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
# Try to extract information about why the notebook is not starting
|
|
# from the notebook's events (see find_error_event)
|
|
notebook_events = get_notebook_events(notebook)
|
|
status_event, reason_event = get_status_from_events(notebook_events)
|
|
if status_event is not None:
|
|
status_phase, status_message = status_event, reason_event
|
|
|
|
# In case there no Events available, show a generic message
|
|
status_phase = status.STATUS_PHASE.WARNING
|
|
status_message = "Couldn't find any information for the status of this notebook." # noqa: E501
|
|
|
|
return status.create_status(status_phase, status_message)
|
|
|
|
|
|
def get_empty_status(notebook):
|
|
creation_timestamp = notebook.get("metadata", {}).get("creationTimestamp")
|
|
notebook_status = notebook.get("status", {})
|
|
container_state = notebook_status.get("containerState")
|
|
conditions = notebook_status.get("conditions")
|
|
|
|
# Convert a date string of a format to datetime object
|
|
nb_creation_time = dt.datetime.strptime(
|
|
creation_timestamp, "%Y-%m-%dT%H:%M:%SZ")
|
|
current_time = dt.datetime.utcnow().replace(microsecond=0)
|
|
delta = (current_time - nb_creation_time)
|
|
|
|
# If the Notebook has no status, the status will be waiting
|
|
# (instead of warning) and we will show a generic message for the first 10
|
|
# seconds
|
|
if not container_state and not conditions and delta.total_seconds() <= 10:
|
|
status_phase = status.STATUS_PHASE.WAITING
|
|
status_message = "Waiting for StatefulSet to create the underlying Pod." # noqa: E501
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def get_stopped_status(notebook):
|
|
ready_replicas = notebook.get("status", {}).get("readyReplicas", 0)
|
|
metadata = notebook.get("metadata", {})
|
|
annotations = metadata.get("annotations", {})
|
|
|
|
if STOP_ANNOTATION in annotations:
|
|
# If the Notebook is stopped, the status will be stopped
|
|
if ready_replicas == 0:
|
|
status_phase = status.STATUS_PHASE.STOPPED
|
|
status_message = "No Pods are currently running for this Notebook Server." # noqa: E501
|
|
return status_phase, status_message
|
|
# If the Notebook is being stopped, the status will be waiting
|
|
else:
|
|
status_phase = status.STATUS_PHASE.WAITING
|
|
status_message = "Notebook Server is stopping."
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def get_deleted_status(notebook):
|
|
metadata = notebook.get("metadata", {})
|
|
|
|
# If the Notebook is being deleted, the status will be terminating
|
|
if "deletionTimestamp" in metadata:
|
|
status_phase = status.STATUS_PHASE.TERMINATING
|
|
status_message = "Deleting this Notebook Server."
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def check_ready_nb(notebook):
|
|
ready_replicas = notebook.get("status", {}).get("readyReplicas", 0)
|
|
|
|
# If the Notebook is running, the status will be ready
|
|
if ready_replicas == 1:
|
|
status_phase, status_message = status.STATUS_PHASE.READY, "Running"
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def get_status_from_container_state(notebook):
|
|
container_state = notebook.get("status", {}).get("containerState", {})
|
|
|
|
if "waiting" in container_state:
|
|
# If the Notebook is initializing, the status will be waiting
|
|
if container_state["waiting"]["reason"] == 'PodInitializing':
|
|
status_phase = status.STATUS_PHASE.WAITING
|
|
status_message = container_state["waiting"]["reason"]
|
|
return status_phase, status_message
|
|
# In any other case, the status will be warning with a "reason:
|
|
# message" showing on hover
|
|
else:
|
|
status_phase = status.STATUS_PHASE.WARNING
|
|
|
|
reason = container_state["waiting"]["reason"]
|
|
message = container_state["waiting"]["message"]
|
|
status_message = '%s: %s' % (reason, message)
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def get_status_from_conditions(notebook):
|
|
conditions = notebook.get("status", {}).get("conditions", [])
|
|
|
|
for condition in conditions:
|
|
# The status will be warning with a "reason: message" showing on hover
|
|
if "reason" in condition:
|
|
status_phase = status.STATUS_PHASE.WARNING
|
|
status_message = condition["reason"] + ': ' + condition["message"]
|
|
return status_phase, status_message
|
|
|
|
return None, None
|
|
|
|
|
|
def get_notebook_events(notebook):
|
|
name = notebook["metadata"]["name"]
|
|
namespace = notebook["metadata"]["namespace"]
|
|
|
|
nb_creation_time = dt.datetime.strptime(
|
|
notebook["metadata"]["creationTimestamp"], "%Y-%m-%dT%H:%M:%SZ"
|
|
)
|
|
|
|
nb_events = api.list_notebook_events(name, namespace).items
|
|
# User can delete and then create a nb server with the same name
|
|
# Make sure previous events are not taken into account
|
|
nb_events = filter(
|
|
lambda e: event_timestamp(e) >= nb_creation_time, nb_events,
|
|
)
|
|
|
|
return nb_events
|
|
|
|
|
|
def get_status_from_events(notebook_events):
|
|
"""
|
|
Returns status and reason from the latest event that surfaces the cause
|
|
of why the resource could not be created. For a Notebook, it can be due to:
|
|
|
|
EVENT_TYPE EVENT_REASON DESCRIPTION
|
|
Warning FailedCreate pods "x" is forbidden: error
|
|
looking up service account ... (originated in statefulset)
|
|
Warning FailedScheduling 0/1 nodes are available: 1
|
|
Insufficient cpu (originated in pod)
|
|
|
|
"""
|
|
for e in sorted(notebook_events, key=event_timestamp, reverse=True):
|
|
if e.type == EVENT_TYPE_WARNING:
|
|
return status.STATUS_PHASE.WARNING, e.message
|
|
|
|
return None, None
|
|
|
|
|
|
def event_timestamp(event):
|
|
return event.metadata.creation_timestamp.replace(tzinfo=None)
|