commit 83e0f75339
merge previously added changes

@@ -29,8 +29,8 @@ dependencies:
     condition: ray-cluster.enabled
     version: "1.3.0"
     repository: "https://ray-project.github.io/kuberay-helm"
-  - condition: ray-cluster.enabled
-    name: ray-cluster
+  - name: ray-cluster
+    condition: ray-cluster.enabled
     version: "1.3.0"
     repository: "https://ray-project.github.io/kuberay-helm"
   - name: jupyterhub
@@ -39,3 +39,7 @@ dependencies:
   - name: mlflow
     version: "0.12.0"
     repository: "https://community-charts.github.io/helm-charts"
+  - name: ollama
+    condition: ollama.enabled
+    version: "1.27.0"
+    repository: "https://helm.otwld.com"

File diff suppressed because one or more lines are too long

@@ -5,3 +5,5 @@ huggingface_hub
 numpy
 ipywidgets
 mlflow
+ollama
+panel
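
The two new requirements pair the Ollama client library with the Panel UI toolkit. A minimal sketch (not part of this commit) of how they might be combined into a small chat UI; only the service host `ai-starter-kit-ollama:11434` and the `gemma3` model come from values added elsewhere in this commit, everything else is illustrative:

```python
# Hypothetical sketch: a Panel chat interface backed by the in-cluster Ollama service.
import panel as pn
from ollama import Client

pn.extension()

client = Client(host="http://ai-starter-kit-ollama:11434")

def callback(contents, user, instance):
    # Forward the user's message to Ollama and return the assistant's reply text.
    reply = client.chat(
        model="gemma3",
        messages=[{"role": "user", "content": contents}],
    )
    return reply["message"]["content"]

pn.chat.ChatInterface(callback=callback).servable()
```
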
@@ -81,6 +81,9 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
       - name: chat-bot-ipynb
         configMap:
           name: "ai-starter-kit-chat-bot-ipynb"
@@ -91,6 +94,9 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
       - name: chat-bot-ipynb
         mountPath: /tmp/chat_bot.ipynb
         subPath: chat_bot.ipynb
@@ -102,8 +108,8 @@ jupyterhub:
          secretKeyRef:
            name: ai-starter-kit-hf-token-secret
            key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
     cloudMetadata:
       # Without this disabled, the GKE Autopilot Warden will raise an error about container with escalated privilieges
       blockWithIptables: false
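
RAY_ADDRESS now targets 10001, the default Ray Client server port, instead of 6379, the GCS port that `ray start --address` expects, and MLFLOW_TRACKING_URI now carries the service port explicitly. A minimal sketch (not part of this commit) of how a notebook pod might consume the two variables, assuming the `ray` and `mlflow` packages are installed:

```python
import os
import ray
import mlflow

# RAY_ADDRESS is injected by the chart, e.g. "ai-starter-kit-kuberay-head-svc:10001".
addr = os.environ.get("RAY_ADDRESS", "ai-starter-kit-kuberay-head-svc:10001")

# Connect through the Ray Client server (ray:// scheme, port 10001).
ray.init(address=addr if addr.startswith("ray://") else f"ray://{addr}")
print(ray.cluster_resources())

# Point MLflow at the in-cluster tracking server.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://ai-starter-kit-mlflow:5000"))
```
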
@@ -138,12 +144,12 @@ ray-cluster:
     serviceType: ClusterIP
     resources:
       requests:
-        cpu: "1"
-        memory: "2G"
+        cpu: "4"
+        memory: "4G"
         ephemeral-storage: 10Gi
       limits:
-        cpu: "4"
-        memory: "8G"
+        cpu: "8"
+        memory: "6G"
         ephemeral-storage: 10Gi
     volumes:
       - name: ray-pvc-storage
@@ -157,12 +163,12 @@ ray-cluster:
   worker:
     resources:
       requests:
-        cpu: "1"
-        memory: "2G"
+        cpu: "4"
+        memory: "4G"
         ephemeral-storage: 10Gi
       limits:
-        cpu: "4"
-        memory: "8G"
+        cpu: "8"
+        memory: "6G"
         ephemeral-storage: 10Gi
     volumes:
       - name: ray-pvc-storage

@@ -38,6 +38,7 @@ jupyterhub:

         python /tmp/download_models.py

+        # populate workspace with initial files
         if [ ! -f /home/jovyan/welcome.ipynb ]; then
           cp /tmp/welcome.ipynb /home/jovyan/welcome.ipynb
         fi
@@ -46,10 +47,12 @@ jupyterhub:
         mountPath: /tmp/requirements.txt
         subPath: requirements.txt
         readOnly: true
+      # This 'home' volume is created by the helm chart's 'homeMountPath' option.
+      # We mount it to initContainer too, so all downloads and installations are persisted in this mounted home folder.
       - name: home
         mountPath: /home/jovyan
         subPath: jupyterhub_workspace
       - name: "download-models-py"
         mountPath: /tmp/download_models.py
         subPath: download_models.py
         readOnly: true
@@ -78,6 +81,12 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
+      - name: chat-bot-ipynb
+        configMap:
+          name: "ai-starter-kit-chat-bot-ipynb"
     extraVolumeMounts:
       - name: requirements-txt
         mountPath: /tmp/requirements.txt
@@ -85,16 +94,22 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
+      - name: chat-bot-ipynb
+        mountPath: /tmp/chat_bot.ipynb
+        subPath: chat_bot.ipynb
     # This environment variables list have its own format: https://z2jh.jupyter.org/en/latest/resources/reference.html#singleuser-extraenv
     extraEnv:
       HF_TOKEN:
         name: HF_TOKEN
         valueFrom:
           secretKeyRef:
             name: ai-starter-kit-hf-token-secret
             key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
   hub:
     db:
       type: sqlite-pvc

@@ -81,6 +81,12 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
+      - name: chat-bot-ipynb
+        configMap:
+          name: "ai-starter-kit-chat-bot-ipynb"
     extraVolumeMounts:
       - name: requirements-txt
         mountPath: /tmp/requirements.txt
@@ -88,6 +94,12 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
+      - name: chat-bot-ipynb
+        mountPath: /tmp/chat_bot.ipynb
+        subPath: chat_bot.ipynb
     # This environment variables list have its own format: https://z2jh.jupyter.org/en/latest/resources/reference.html#singleuser-extraenv
     extraEnv:
       HF_TOKEN:
@@ -96,8 +108,8 @@ jupyterhub:
          secretKeyRef:
            name: ai-starter-kit-hf-token-secret
            key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
   hub:
     db:
       type: sqlite-pvc
@@ -162,6 +174,18 @@ localPersistence:
   # This path must match the destination path inside the minikube node.
   hostPath: "/tmp/models-cache"

+
+ollama:
+  enabled: true
+  ollama:
+    models:
+      pull:
+        - gemma3
+  persistentVolume:
+    enabled: true
+    existingClaim: "ai-starter-kit-models-cache-pvc"
+    subPath: "ollama"
+
 ramalama:
   enabled: true
   command: ["sh", "-c" , "trap 'exit 0' TERM; while true; do sleep 60 & wait; done"]

@@ -0,0 +1,621 @@
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "079fadd2-200e-4d37-8ae2-be2792e3a24e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 1 - Initialize Ray endpoints and verify dashboard\n",
|
||||||
|
"\n",
|
||||||
|
"Installs requests, derives the Ray head host from RAY_ADDRESS, builds Dashboard/Serve/MLflow URLs, reads an Hugging Face token, and prints the endpoints plus the Jobs API version for a quick health check."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install requests==2.* --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, textwrap, base64, time, json, requests\n",
|
||||||
|
"from string import Template\n",
|
||||||
|
"\n",
|
||||||
|
"raw_addr = os.getenv(\"RAY_ADDRESS\", \"ray://ai-starter-kit-kuberay-head-svc:10001\")\n",
|
||||||
|
"if raw_addr.startswith(\"ray://\"):\n",
|
||||||
|
" HEAD_HOST = raw_addr.split(\"://\", 1)[1].split(\":\", 1)[0]\n",
|
||||||
|
"else:\n",
|
||||||
|
" HEAD_HOST = raw_addr.split(\":\", 1)[0] or \"ai-starter-kit-kuberay-head-svc\"\n",
|
||||||
|
"\n",
|
||||||
|
"DASH_URL = f\"http://{HEAD_HOST}:8265\"\n",
|
||||||
|
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"8000\"))\n",
|
||||||
|
"SERVE_ROUTE = \"/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"HF_TOKEN_PATH = \"/etc/secrets/huggingface/token\"\n",
|
||||||
|
"HF_TOKEN = \"\"\n",
|
||||||
|
"if os.path.exists(HF_TOKEN_PATH):\n",
|
||||||
|
" try:\n",
|
||||||
|
" HF_TOKEN = open(HF_TOKEN_PATH).read().strip()\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" HF_TOKEN = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Head host:\", HEAD_HOST)\n",
|
||||||
|
"print(\"Jobs API :\", f\"{DASH_URL}/api/jobs/\")\n",
|
||||||
|
"print(\"Serve URL:\", f\"http://{HEAD_HOST}:{SERVE_PORT}{SERVE_ROUTE}\")\n",
|
||||||
|
"print(\"MLflow :\", os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\"))\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Jobs API version:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe862173-fd9a-41ae-a27b-63875f788024",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 2 - Deploy a minimal Ray Serve smoke test and verify readiness\n",
|
||||||
|
"\n",
|
||||||
|
"Submits a tiny FastAPI app to Ray Serve (one /healthz endpoint under /smoke) as a Ray Job, installing FastAPI on the fly. It polls the Jobs API for status and hits :8000/smoke/healthz up to 60 seconds, printing when the service responds 200 (i.e., smoke test passes)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "34da3e26-6276-48b7-b3ac-c90359df6547",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, base64, textwrap, time, requests\n",
|
||||||
|
"\n",
|
||||||
|
"DASH_URL = \"http://ai-starter-kit-kuberay-head-svc:8265\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Jobs API:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n",
|
||||||
|
"\n",
|
||||||
|
"serve_py = textwrap.dedent(\"\"\"\n",
|
||||||
|
" from fastapi import FastAPI\n",
|
||||||
|
" from ray import serve\n",
|
||||||
|
" serve.start(detached=True, http_options={\"host\":\"0.0.0.0\",\"port\":8000})\n",
|
||||||
|
" app = FastAPI()\n",
|
||||||
|
"\n",
|
||||||
|
" @serve.deployment(name=\"smoke\", num_replicas=1)\n",
|
||||||
|
" @serve.ingress(app)\n",
|
||||||
|
" class Smoke:\n",
|
||||||
|
" @app.get(\"/healthz\")\n",
|
||||||
|
" async def health(self): return {\"ok\": True}\n",
|
||||||
|
"\n",
|
||||||
|
" serve.run(Smoke.bind(), route_prefix=\"/smoke\")\n",
|
||||||
|
" print(\"READY: smoke\", flush=True)\n",
|
||||||
|
"\"\"\").strip()\n",
|
||||||
|
"\n",
|
||||||
|
"b64 = base64.b64encode(serve_py.encode()).decode()\n",
|
||||||
|
"entry = f'python -c \"import base64; exec(base64.b64decode(\\'{b64}\\'))\"'\n",
|
||||||
|
"submit = requests.post(f\"{DASH_URL}/api/jobs/\", json={\"entrypoint\": entry, \"runtime_env\": {\"pip\": [\"fastapi>=0.110\"]}}, timeout=60).json()\n",
|
||||||
|
"job_id = submit[\"job_id\"]\n",
|
||||||
|
"print(\"Job:\", job_id)\n",
|
||||||
|
"\n",
|
||||||
|
"svc = \"http://ai-starter-kit-kuberay-head-svc:8000/smoke/healthz\"\n",
|
||||||
|
"for i in range(60):\n",
|
||||||
|
" s = requests.get(f\"{DASH_URL}/api/jobs/{job_id}\", timeout=10).json()[\"status\"]\n",
|
||||||
|
" try:\n",
|
||||||
|
" r = requests.get(svc, timeout=2)\n",
|
||||||
|
" print(f\"tick {i:02d}: job={s}, health={r.status_code}\")\n",
|
||||||
|
" if r.status_code == 200:\n",
|
||||||
|
" print(\"Smoke OK\")\n",
|
||||||
|
" break\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\"tick {i:02d}: job={s}, health=ERR {e}\")\n",
|
||||||
|
" time.sleep(1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8111d705-595e-4e65-8479-bdc76191fa31",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 3 - Deploy model on Ray Serve with llama-cpp\n",
|
||||||
|
"\n",
|
||||||
|
"Packages and submits a Ray Job that spins up a Ray Serve app exposing /v1/healthz and /v1/chat/completions. It downloads the preferred GGUF from Hugging Face, initializes llama-cpp-python, logs to MLflow, and prints the deployed health/chat URLs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bbea1539-e9ab-460a-9cfc-20a42807f616",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, base64, textwrap, requests\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"DASH_URL = f\"http://{HEAD}:8265\"\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"SERVE_ROUTE = \"/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"runtime_env = {\n",
|
||||||
|
" \"pip\": [\n",
|
||||||
|
" \"fastapi==0.110.0\",\n",
|
||||||
|
" \"uvicorn==0.23.2\",\n",
|
||||||
|
" \"huggingface_hub==0.25.2\",\n",
|
||||||
|
" \"llama-cpp-python==0.3.16\", \n",
|
||||||
|
" \"hf_transfer==0.1.6\",\n",
|
||||||
|
" \"mlflow==2.14.3\", \n",
|
||||||
|
" ],\n",
|
||||||
|
" \"env_vars\": {\n",
|
||||||
|
" \"HF_HUB_ENABLE_HF_TRANSFER\": \"1\",\n",
|
||||||
|
" \"HUGGINGFACE_HUB_TOKEN\": os.environ.get(\"HUGGINGFACE_HUB_TOKEN\", \"\"),\n",
|
||||||
|
" \"SERVE_PORT\": str(SERVE_PORT),\n",
|
||||||
|
"\n",
|
||||||
|
" \"MODEL_REPO\": \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\",\n",
|
||||||
|
" \"GGUF_PREF_ORDER\": \"q4_k_m,q4_0,q3_k_m,q2_k\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"LLM_CONTEXT\": os.environ.get(\"LLM_CONTEXT\", \"1024\"),\n",
|
||||||
|
" \"LLM_MAX_TOKENS\": os.environ.get(\"LLM_MAX_TOKENS\", \"256\"),\n",
|
||||||
|
" \"SERVER_MAX_NEW_TOKENS\": os.environ.get(\"SERVER_MAX_NEW_TOKENS\", \"512\"),\n",
|
||||||
|
"\n",
|
||||||
|
" \"LLM_THREADS\": os.environ.get(\"LLM_THREADS\", \"6\"),\n",
|
||||||
|
" \"OMP_NUM_THREADS\": os.environ.get(\"OMP_NUM_THREADS\", \"6\"),\n",
|
||||||
|
" \"GPU_LAYERS\": \"0\", \n",
|
||||||
|
" \n",
|
||||||
|
" \"PIP_PREFER_BINARY\": \"1\",\n",
|
||||||
|
" \"CMAKE_ARGS\": \"-DGGML_OPENMP=OFF -DLLAMA_NATIVE=OFF\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"HF_HOME\": \"/tmp/hf-cache\",\n",
|
||||||
|
" \"TRANSFORMERS_CACHE\": \"/tmp/hf-cache\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"MLFLOW_TRACKING_URI\": os.environ.get(\"MLFLOW_TRACKING_URI\", \"\"),\n",
|
||||||
|
" \"MLFLOW_EXPERIMENT_NAME\": os.environ.get(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\"),\n",
|
||||||
|
" },\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"serve_py = textwrap.dedent(f\"\"\"\n",
|
||||||
|
"import os, time, multiprocessing, uuid\n",
|
||||||
|
"from typing import List, Dict, Any\n",
|
||||||
|
"from fastapi import FastAPI, Request\n",
|
||||||
|
"from fastapi.responses import JSONResponse\n",
|
||||||
|
"from huggingface_hub import HfApi, hf_hub_download\n",
|
||||||
|
"from ray import serve\n",
|
||||||
|
"from llama_cpp import Llama\n",
|
||||||
|
"\n",
|
||||||
|
"USE_MLFLOW = False\n",
|
||||||
|
"try:\n",
|
||||||
|
" import mlflow\n",
|
||||||
|
" if os.getenv(\"MLFLOW_TRACKING_URI\"):\n",
|
||||||
|
" mlflow.set_tracking_uri(os.getenv(\"MLFLOW_TRACKING_URI\"))\n",
|
||||||
|
" mlflow.set_experiment(os.getenv(\"MLFLOW_EXPERIMENT_NAME\",\"ray-llama-cpp\"))\n",
|
||||||
|
" USE_MLFLOW = True\n",
|
||||||
|
"except Exception as _e:\n",
|
||||||
|
" USE_MLFLOW = False\n",
|
||||||
|
"\n",
|
||||||
|
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"{SERVE_PORT}\"))\n",
|
||||||
|
"SERVE_ROUTE = \"{SERVE_ROUTE}\"\n",
|
||||||
|
"MODEL_REPO = os.getenv(\"MODEL_REPO\", \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\")\n",
|
||||||
|
"GGUF_PREFS = [s.strip() for s in os.getenv(\"GGUF_PREF_ORDER\",\"q4_k_m,q4_0,q3_k_m,q2_k\").split(\",\") if s.strip()]\n",
|
||||||
|
"CTX_LEN = int(os.getenv(\"LLM_CONTEXT\", \"2048\"))\n",
|
||||||
|
"MAX_TOKENS = int(os.getenv(\"LLM_MAX_TOKENS\", \"256\"))\n",
|
||||||
|
"HF_TOKEN = os.getenv(\"HUGGINGFACE_HUB_TOKEN\") or None\n",
|
||||||
|
"\n",
|
||||||
|
"serve.start(detached=True, http_options={{\"host\":\"0.0.0.0\", \"port\":SERVE_PORT}})\n",
|
||||||
|
"app = FastAPI()\n",
|
||||||
|
"\n",
|
||||||
|
"def pick_one_file(repo_id: str, prefs):\n",
|
||||||
|
" api = HfApi()\n",
|
||||||
|
" files = api.list_repo_files(repo_id=repo_id, repo_type=\"model\", token=HF_TOKEN)\n",
|
||||||
|
" ggufs = [f for f in files if f.lower().endswith(\".gguf\")]\n",
|
||||||
|
" if not ggufs:\n",
|
||||||
|
" raise RuntimeError(f\"No .gguf files visible in {{repo_id}}\")\n",
|
||||||
|
" for pref in prefs:\n",
|
||||||
|
" for f in ggufs:\n",
|
||||||
|
" if pref.lower() in f.lower():\n",
|
||||||
|
" return f\n",
|
||||||
|
" return ggufs[0]\n",
|
||||||
|
"\n",
|
||||||
|
"def pick_chat_format(repo: str, fname: str) -> str:\n",
|
||||||
|
" return \"qwen\"\n",
|
||||||
|
"\n",
|
||||||
|
"@serve.deployment(name=\"qwen\", num_replicas=1, ray_actor_options={{\"num_cpus\": 6}})\n",
|
||||||
|
"@serve.ingress(app)\n",
|
||||||
|
"class OpenAICompatLlama:\n",
|
||||||
|
" def __init__(self, repo_id: str = MODEL_REPO):\n",
|
||||||
|
" target = pick_one_file(repo_id, GGUF_PREFS)\n",
|
||||||
|
" print(f\"[env] model repo: {{repo_id}} file: {{target}}\", flush=True)\n",
|
||||||
|
" local_dir = \"/tmp/hf-gguf\"; os.makedirs(local_dir, exist_ok=True)\n",
|
||||||
|
"\n",
|
||||||
|
" gguf_path = hf_hub_download(\n",
|
||||||
|
" repo_id=repo_id, filename=target, token=HF_TOKEN,\n",
|
||||||
|
" local_dir=local_dir, local_dir_use_symlinks=False,\n",
|
||||||
|
" force_download=False, resume_download=True\n",
|
||||||
|
" )\n",
|
||||||
|
" print(f\"[download] done: {{gguf_path}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" n_threads = int(os.getenv(\"LLM_THREADS\", max(2, (multiprocessing.cpu_count() or 4)//2)))\n",
|
||||||
|
" print(f\"[load] llama-cpp-python | ctx={{CTX_LEN}} threads={{n_threads}} gpu_layers={{int(os.getenv('GPU_LAYERS','0'))}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" self.model_file = os.path.basename(gguf_path)\n",
|
||||||
|
" self.model_repo = repo_id\n",
|
||||||
|
" chat_format = pick_chat_format(self.model_repo, self.model_file)\n",
|
||||||
|
" print(f\"[load] chat_format={{chat_format}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" self.llm = Llama(\n",
|
||||||
|
" model_path=gguf_path,\n",
|
||||||
|
" n_ctx=CTX_LEN,\n",
|
||||||
|
" n_threads=n_threads,\n",
|
||||||
|
" n_batch=256, \n",
|
||||||
|
" n_gpu_layers=int(os.getenv(\"GPU_LAYERS\",\"0\")),\n",
|
||||||
|
" chat_format=chat_format,\n",
|
||||||
|
" verbose=False\n",
|
||||||
|
" )\n",
|
||||||
|
" print(\"[ready] model loaded\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" @app.get(\"/healthz\")\n",
|
||||||
|
" async def health(self):\n",
|
||||||
|
" return {{\"status\":\"ok\"}}\n",
|
||||||
|
"\n",
|
||||||
|
" @app.post(\"/chat/completions\")\n",
|
||||||
|
" async def chat_completions(self, request: Request):\n",
|
||||||
|
" t0 = time.time()\n",
|
||||||
|
" body = await request.json()\n",
|
||||||
|
"\n",
|
||||||
|
" messages = body.get(\"messages\", [])\n",
|
||||||
|
" temperature = float(body.get(\"temperature\", 0.2))\n",
|
||||||
|
" req_max = body.get(\"max_tokens\", None)\n",
|
||||||
|
" stop_words = (body.get(\"stop\", []) or []) + [\"<|im_end|>\", \"</s>\"]\n",
|
||||||
|
"\n",
|
||||||
|
" SERVER_MAX = int(os.getenv(\"SERVER_MAX_NEW_TOKENS\", \"512\"))\n",
|
||||||
|
" max_tokens = int(req_max if isinstance(req_max, int) else MAX_TOKENS)\n",
|
||||||
|
" max_tokens = max(32, min(max_tokens, CTX_LEN - 128, SERVER_MAX))\n",
|
||||||
|
"\n",
|
||||||
|
" rid = \"chatcmpl-\" + uuid.uuid4().hex[:24]\n",
|
||||||
|
" created = int(time.time())\n",
|
||||||
|
" model_name = f\"{{self.model_repo}}/{{self.model_file}}\"\n",
|
||||||
|
"\n",
|
||||||
|
" try:\n",
|
||||||
|
" result = self.llm.create_chat_completion(\n",
|
||||||
|
" messages=messages,\n",
|
||||||
|
" temperature=temperature,\n",
|
||||||
|
" max_tokens=max_tokens,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.9,\n",
|
||||||
|
" repeat_penalty=1.1,\n",
|
||||||
|
" stop=stop_words,\n",
|
||||||
|
" )\n",
|
||||||
|
" out_text = (result[\"choices\"][0][\"message\"][\"content\"] or \"\").strip()\n",
|
||||||
|
" usage_raw = result.get(\"usage\") or {{}}\n",
|
||||||
|
" p_tokens = int(usage_raw.get(\"prompt_tokens\") or 0)\n",
|
||||||
|
" c_tokens = int(usage_raw.get(\"completion_tokens\") or 0)\n",
|
||||||
|
" err = None\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" out_text = \"\"\n",
|
||||||
|
" p_tokens = c_tokens = 0\n",
|
||||||
|
" err = str(e)\n",
|
||||||
|
"\n",
|
||||||
|
" if USE_MLFLOW:\n",
|
||||||
|
" try:\n",
|
||||||
|
" dur_ms = int((time.time()-t0) * 1000)\n",
|
||||||
|
" with mlflow.start_run(run_name=\"chat\"):\n",
|
||||||
|
" mlflow.set_tags({{\n",
|
||||||
|
" \"model_repo\": self.model_repo,\n",
|
||||||
|
" \"model_file\": self.model_file,\n",
|
||||||
|
" \"framework\": \"llama-cpp-python\",\n",
|
||||||
|
" }})\n",
|
||||||
|
" mlflow.log_params({{\n",
|
||||||
|
" \"temperature\": temperature,\n",
|
||||||
|
" \"max_tokens\": max_tokens,\n",
|
||||||
|
" \"ctx\": CTX_LEN,\n",
|
||||||
|
" }})\n",
|
||||||
|
" if not (p_tokens and c_tokens):\n",
|
||||||
|
" p_tokens = p_tokens or max(1, len(\" \".join(m.get(\"content\",\"\") for m in messages).split()))\n",
|
||||||
|
" c_tokens = c_tokens or max(0, len(out_text.split()))\n",
|
||||||
|
" mlflow.log_metrics({{\n",
|
||||||
|
" \"duration_ms\": dur_ms,\n",
|
||||||
|
" \"prompt_tokens_approx\": p_tokens,\n",
|
||||||
|
" \"completion_tokens_approx\": c_tokens,\n",
|
||||||
|
" \"total_tokens_approx\": p_tokens + c_tokens,\n",
|
||||||
|
" }})\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" pass\n",
|
||||||
|
"\n",
|
||||||
|
" if err:\n",
|
||||||
|
" return JSONResponse(status_code=500, content={{\"error\": err, \"type\":\"generation_error\"}})\n",
|
||||||
|
"\n",
|
||||||
|
" usage = {{\n",
|
||||||
|
" \"prompt_tokens\": p_tokens,\n",
|
||||||
|
" \"completion_tokens\": c_tokens,\n",
|
||||||
|
" \"total_tokens\": p_tokens + c_tokens,\n",
|
||||||
|
" }}\n",
|
||||||
|
" return {{\n",
|
||||||
|
" \"id\": rid,\n",
|
||||||
|
" \"object\": \"chat.completion\",\n",
|
||||||
|
" \"created\": created,\n",
|
||||||
|
" \"model\": model_name,\n",
|
||||||
|
" \"choices\": [\n",
|
||||||
|
" {{\n",
|
||||||
|
" \"index\": 0,\n",
|
||||||
|
" \"message\": {{\"role\":\"assistant\",\"content\": out_text}},\n",
|
||||||
|
" \"finish_reason\": \"stop\"\n",
|
||||||
|
" }}\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"usage\": usage\n",
|
||||||
|
" }}\n",
|
||||||
|
"\n",
|
||||||
|
"serve.run(OpenAICompatLlama.bind(), route_prefix=SERVE_ROUTE)\n",
|
||||||
|
"print(\"READY\", flush=True)\n",
|
||||||
|
"\"\"\").strip()\n",
|
||||||
|
"\n",
|
||||||
|
"payload = base64.b64encode(serve_py.encode()).decode()\n",
|
||||||
|
"entrypoint = 'python -c \"import base64,sys;exec(base64.b64decode(\\'{}\\').decode())\"'.format(payload)\n",
|
||||||
|
"\n",
|
||||||
|
"job = requests.post(\n",
|
||||||
|
" f\"{DASH_URL}/api/jobs/\",\n",
|
||||||
|
" json={\n",
|
||||||
|
" \"entrypoint\": entrypoint,\n",
|
||||||
|
" \"runtime_env\": runtime_env,\n",
|
||||||
|
" \"metadata\": {\"job_name\": \"serve-qwen2_5-llama_cpp-openai\"},\n",
|
||||||
|
" },\n",
|
||||||
|
" timeout=45\n",
|
||||||
|
").json()\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Job:\", job.get(\"job_id\"))\n",
|
||||||
|
"print(\"Health:\", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/healthz\")\n",
|
||||||
|
"print(\"Chat: \", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/chat/completions\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a411c015-c802-4ca1-81bb-3f4790d9626a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 4 - Basic client + latency test\n",
|
||||||
|
"\n",
|
||||||
|
"Calls /v1/healthz and then sends an OpenAI-style chat request to /v1/chat/completions with a short prompt. Prints latency and token usage, returning the assistant text."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3be634e2-a82f-42c9-8e31-57e6868a86ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, time, requests, json\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"def health():\n",
|
||||||
|
" r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n",
|
||||||
|
" print(\"Health:\", r.status_code, r.json())\n",
|
||||||
|
"\n",
|
||||||
|
"def chat(prompt, temperature=0.4, max_tokens=220, stop=None):\n",
|
||||||
|
" body = {\n",
|
||||||
|
" \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n",
|
||||||
|
" \"temperature\": float(temperature),\n",
|
||||||
|
" \"max_tokens\": int(max_tokens),\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\"role\": \"system\", \"content\": \"You are Qwen2.5 Instruct running on a tiny CPU host. Be concise, complete sentences.\"},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": prompt},\n",
|
||||||
|
" ],\n",
|
||||||
|
" }\n",
|
||||||
|
" if stop:\n",
|
||||||
|
" body[\"stop\"] = stop\n",
|
||||||
|
"\n",
|
||||||
|
" t0 = time.time()\n",
|
||||||
|
" r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=300)\n",
|
||||||
|
" dt = time.time() - t0\n",
|
||||||
|
" r.raise_for_status()\n",
|
||||||
|
" out = r.json()[\"choices\"][0][\"message\"][\"content\"]\n",
|
||||||
|
" usage = r.json().get(\"usage\", {})\n",
|
||||||
|
" print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n",
|
||||||
|
" print(\"\\n---\\n\", out)\n",
|
||||||
|
" return out\n",
|
||||||
|
"\n",
|
||||||
|
"health()\n",
|
||||||
|
"_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\", stop=[\"<|im_end|>\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "553d2756-8949-43e3-8342-71387688e0fa",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 5 - Multi-agent (Autogen) pipeline\n",
|
||||||
|
"\n",
|
||||||
|
"Installs Autogen, configures OpenAIWrapper to hit Ray Serve /v1 endpoint, warms up the model, then runs a simple three-agent workflow (Researcher -> Writer -> Critic) to produce and refine a short report."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install pyautogen~=0.2.35 \"flaml[automl]\" --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, sys\n",
|
||||||
|
"\n",
|
||||||
|
"for p in [\n",
|
||||||
|
" \"/tmp/models-cache/lib/python3.11/site-packages\", \n",
|
||||||
|
" os.path.expanduser(\"~/.local/lib/python3.11/site-packages\"), \n",
|
||||||
|
"]:\n",
|
||||||
|
" if os.path.isdir(p) and p not in sys.path:\n",
|
||||||
|
" sys.path.insert(0, p)\n",
|
||||||
|
"\n",
|
||||||
|
"import os, autogen\n",
|
||||||
|
"from autogen import AssistantAgent, UserProxyAgent\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\" \n",
|
||||||
|
"\n",
|
||||||
|
"config_list = [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"model\": \"qwen2.5-1.5b-instruct-gguf\", \n",
|
||||||
|
" \"base_url\": BASE_URL, \n",
|
||||||
|
" \"api_key\": \"local\", \n",
|
||||||
|
" \"price\": [0.0, 0.0],\n",
|
||||||
|
" }\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"llm = autogen.OpenAIWrapper(config_list=config_list)\n",
|
||||||
|
"try:\n",
|
||||||
|
" r = llm.create(messages=[{\"role\":\"user\",\"content\":\"Say 'test ok'.\"}], temperature=0.2, max_tokens=16)\n",
|
||||||
|
" print(\"Warmup:\", r.choices[0].message.content)\n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(\"Warmup failed:\", e)\n",
|
||||||
|
"\n",
|
||||||
|
"user_proxy = UserProxyAgent(\n",
|
||||||
|
" name=\"UserProxy\",\n",
|
||||||
|
" system_message=\"You are the human admin. Initiate the task.\",\n",
|
||||||
|
" code_execution_config=False,\n",
|
||||||
|
" human_input_mode=\"NEVER\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"researcher = AssistantAgent(\n",
|
||||||
|
" name=\"Researcher\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a researcher. Gather concise, verified facts on the topic. \"\n",
|
||||||
|
" \"Return several bullet points with inline source domains (e.g., nature.com, ibm.com). \"\n",
|
||||||
|
" \"Keep under 100 words total. No made-up sources. \"\n",
|
||||||
|
" \"Do not include any special end token.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.35, \"max_tokens\": 140, \"timeout\": 300},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"writer = AssistantAgent(\n",
|
||||||
|
" name=\"Writer\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a writer. Using the Researcher’s notes, produce a clear word report under 160 words. \"\n",
|
||||||
|
" \"Avoid speculation. Keep it structured and readable. \"\n",
|
||||||
|
" \"Do not include any special end token.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.55, \"max_tokens\": 220, \"timeout\": 180},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"critic = AssistantAgent(\n",
|
||||||
|
" name=\"Critic\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a critic. Review the Writer’s report for accuracy, clarity, and flow.\"\n",
|
||||||
|
" \"Present the tightened final text and keep it under 140 words. On a new last line output exactly: <|END|>\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.45, \"max_tokens\": 160, \"timeout\": 300},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"def run_sequential(task):\n",
|
||||||
|
" research_response = researcher.generate_reply(messages=[{\"content\": task, \"role\": \"user\"}])\n",
|
||||||
|
" research_notes = research_response if isinstance(research_response, str) else research_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nResearch Notes:\\n\", research_notes)\n",
|
||||||
|
"\n",
|
||||||
|
" writer_prompt = f\"Using these research notes, write the report:\\n{research_notes}\"\n",
|
||||||
|
" writer_response = writer.generate_reply(messages=[{\"content\": writer_prompt, \"role\": \"user\"}])\n",
|
||||||
|
" report = writer_response if isinstance(writer_response, str) else writer_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nDraft Report:\\n\", report)\n",
|
||||||
|
"\n",
|
||||||
|
" critic_prompt = f\"Review this report:\\n{report}\"\n",
|
||||||
|
" critic_response = critic.generate_reply(messages=[{\"content\": critic_prompt, \"role\": \"user\"}])\n",
|
||||||
|
" final_text = critic_response if isinstance(critic_response, str) else critic_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nFinal Review:\\n\", final_text)\n",
|
||||||
|
" return final_text\n",
|
||||||
|
"\n",
|
||||||
|
"task = \"Research the latest advancements in quantum computing as of 2025. Gather key facts, then write a short report (200–300 words). Have the Critic review and finalize.\"\n",
|
||||||
|
"final_output = run_sequential(task)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 6 - MLFlow: connect to tracking server and list recent chat runs\n",
|
||||||
|
"\n",
|
||||||
|
"Installs MLflow, sets the tracking URI and experiment, then queries and prints the latest runs with key params/metrics (temperature, max_tokens, duration) to verify Serve logging."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install mlflow==2.14.3 --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, mlflow\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"\n",
|
||||||
|
"tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n",
|
||||||
|
"mlflow.set_tracking_uri(tracking_uri)\n",
|
||||||
|
"print(f\"MLflow Tracking URI: {tracking_uri}\")\n",
|
||||||
|
"\n",
|
||||||
|
"exp_name = os.getenv(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\")\n",
|
||||||
|
"exp = mlflow.set_experiment(exp_name)\n",
|
||||||
|
"print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n",
|
||||||
|
"print(\"-\" * 60)\n",
|
||||||
|
"\n",
|
||||||
|
"client = mlflow.tracking.MlflowClient()\n",
|
||||||
|
"runs = client.search_runs(\n",
|
||||||
|
" exp.experiment_id, \n",
|
||||||
|
" order_by=[\"attributes.start_time DESC\"], \n",
|
||||||
|
" max_results=10\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"if not runs:\n",
|
||||||
|
" print(\"No runs found. Run cells 4 or 5 first to generate inference requests.\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(f\"\\nFound {len(runs)} recent runs:\")\n",
|
||||||
|
" print(\"-\" * 60)\n",
|
||||||
|
" \n",
|
||||||
|
" for i, run in enumerate(runs, 1):\n",
|
||||||
|
" start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n",
|
||||||
|
" duration = run.data.metrics.get('duration_ms', 'N/A')\n",
|
||||||
|
" temp = run.data.params.get('temperature', 'N/A')\n",
|
||||||
|
" max_tokens = run.data.params.get('max_tokens', 'N/A')\n",
|
||||||
|
" total_tokens = run.data.metrics.get('total_tokens_approx', 'N/A')\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"\\nRun {i}:\")\n",
|
||||||
|
" print(f\" ID: {run.info.run_id[:12]}...\")\n",
|
||||||
|
" print(f\" Time: {start_time}\")\n",
|
||||||
|
" print(f\" Status: {run.info.status}\")\n",
|
||||||
|
" print(f\" Temperature: {temp}\")\n",
|
||||||
|
" print(f\" Max Tokens: {max_tokens}\")\n",
|
||||||
|
" print(f\" Duration: {duration} ms\")\n",
|
||||||
|
" print(f\" Total Tokens: {total_tokens}\")\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"\\n\" + \"=\" * 60)\n",
|
||||||
|
" print(\"SUMMARY:\")\n",
|
||||||
|
" successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n",
|
||||||
|
" durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n",
|
||||||
|
" avg_duration = sum(durations) / len(durations) if durations else 0\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\" Total Runs: {len(runs)}\")\n",
|
||||||
|
" print(f\" Successful: {successful}\")\n",
|
||||||
|
" print(f\" Failed: {len(runs) - successful}\")\n",
|
||||||
|
" print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"\\n\" + \"=\" * 60)\n",
|
||||||
|
"print(\"MLflow verification complete\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|
@@ -0,0 +1,11 @@
from ollama import Client
client = Client(
  host='http://ai-starter-kit-ollama:11434',
  headers={'x-some-header': 'some-value'}
)
response = client.chat(model='gemma3', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
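
The new example builds the request but stops short of showing the reply. A possible continuation (not part of this commit; it assumes the same in-cluster service and the dict-style response the Ollama Python client returns):

```python
from ollama import Client

client = Client(host='http://ai-starter-kit-ollama:11434')

# Single-shot request: the full reply arrives as one message.
response = client.chat(model='gemma3', messages=[
    {'role': 'user', 'content': 'Why is the sky blue?'},
])
print(response['message']['content'])

# Streaming variant: chunks arrive incrementally instead of one final message.
for chunk in client.chat(
    model='gemma3',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
    stream=True,
):
    print(chunk['message']['content'], end='', flush=True)
```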