commit 83e0f75339
merge previously added changes

@@ -29,8 +29,8 @@ dependencies:
     condition: ray-cluster.enabled
     version: "1.3.0"
     repository: "https://ray-project.github.io/kuberay-helm"
-  - condition: ray-cluster.enabled
-    name: ray-cluster
+  - name: ray-cluster
+    condition: ray-cluster.enabled
     version: "1.3.0"
     repository: "https://ray-project.github.io/kuberay-helm"
   - name: jupyterhub
@@ -39,3 +39,7 @@ dependencies:
   - name: mlflow
     version: "0.12.0"
     repository: "https://community-charts.github.io/helm-charts"
+  - name: ollama
+    condition: ollama.enabled
+    version: "1.27.0"
+    repository: "https://helm.otwld.com"

File diff suppressed because one or more lines are too long

@@ -5,3 +5,5 @@ huggingface_hub
 numpy
 ipywidgets
 mlflow
+ollama
+panel
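
The two new requirements pair the Ollama client library with the Panel UI toolkit. A minimal sketch (not part of this commit) of how they might be combined into a small chat UI; only the service host `ai-starter-kit-ollama:11434` and the `gemma3` model come from values added elsewhere in this commit, everything else is illustrative:

```python
# Hypothetical sketch: a Panel chat interface backed by the in-cluster Ollama service.
import panel as pn
from ollama import Client

pn.extension()

client = Client(host="http://ai-starter-kit-ollama:11434")

def callback(contents, user, instance):
    # Forward the user's message to Ollama and return the assistant's reply text.
    reply = client.chat(
        model="gemma3",
        messages=[{"role": "user", "content": contents}],
    )
    return reply["message"]["content"]

pn.chat.ChatInterface(callback=callback).servable()
```
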
@@ -81,6 +81,9 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
       - name: chat-bot-ipynb
         configMap:
           name: "ai-starter-kit-chat-bot-ipynb"
@@ -91,6 +94,9 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
       - name: chat-bot-ipynb
         mountPath: /tmp/chat_bot.ipynb
         subPath: chat_bot.ipynb
@@ -102,8 +108,8 @@ jupyterhub:
          secretKeyRef:
            name: ai-starter-kit-hf-token-secret
            key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
     cloudMetadata:
       # Without this disabled, the GKE Autopilot Warden will raise an error about container with escalated privilieges
       blockWithIptables: false
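
RAY_ADDRESS now targets 10001, the default Ray Client server port, instead of 6379, the GCS port that `ray start --address` expects, and MLFLOW_TRACKING_URI now carries the service port explicitly. A minimal sketch (not part of this commit) of how a notebook pod might consume the two variables, assuming the `ray` and `mlflow` packages are installed:

```python
import os
import ray
import mlflow

# RAY_ADDRESS is injected by the chart, e.g. "ai-starter-kit-kuberay-head-svc:10001".
addr = os.environ.get("RAY_ADDRESS", "ai-starter-kit-kuberay-head-svc:10001")

# Connect through the Ray Client server (ray:// scheme, port 10001).
ray.init(address=addr if addr.startswith("ray://") else f"ray://{addr}")
print(ray.cluster_resources())

# Point MLflow at the in-cluster tracking server.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://ai-starter-kit-mlflow:5000"))
```
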
@@ -138,12 +144,12 @@ ray-cluster:
     serviceType: ClusterIP
     resources:
       requests:
-        cpu: "1"
-        memory: "2G"
+        cpu: "4"
+        memory: "4G"
         ephemeral-storage: 10Gi
       limits:
-        cpu: "4"
-        memory: "8G"
+        cpu: "8"
+        memory: "6G"
         ephemeral-storage: 10Gi
     volumes:
       - name: ray-pvc-storage
@@ -157,12 +163,12 @@ ray-cluster:
   worker:
     resources:
       requests:
-        cpu: "1"
-        memory: "2G"
+        cpu: "4"
+        memory: "4G"
         ephemeral-storage: 10Gi
       limits:
-        cpu: "4"
-        memory: "8G"
+        cpu: "8"
+        memory: "6G"
         ephemeral-storage: 10Gi
     volumes:
       - name: ray-pvc-storage

@@ -38,6 +38,7 @@ jupyterhub:

         python /tmp/download_models.py

+        # populate workspace with initial files
         if [ ! -f /home/jovyan/welcome.ipynb ]; then
           cp /tmp/welcome.ipynb /home/jovyan/welcome.ipynb
         fi
@@ -46,10 +47,12 @@ jupyterhub:
         mountPath: /tmp/requirements.txt
         subPath: requirements.txt
         readOnly: true
+      # This 'home' volume is created by the helm chart's 'homeMountPath' option.
+      # We mount it to initContainer too, so all downloads and installations are persisted in this mounted home folder.
       - name: home
         mountPath: /home/jovyan
         subPath: jupyterhub_workspace
       - name: "download-models-py"
         mountPath: /tmp/download_models.py
         subPath: download_models.py
         readOnly: true
@@ -78,6 +81,12 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
+      - name: chat-bot-ipynb
+        configMap:
+          name: "ai-starter-kit-chat-bot-ipynb"
     extraVolumeMounts:
       - name: requirements-txt
         mountPath: /tmp/requirements.txt
@@ -85,16 +94,22 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
+      - name: chat-bot-ipynb
+        mountPath: /tmp/chat_bot.ipynb
+        subPath: chat_bot.ipynb
     # This environment variables list have its own format: https://z2jh.jupyter.org/en/latest/resources/reference.html#singleuser-extraenv
     extraEnv:
       HF_TOKEN:
         name: HF_TOKEN
         valueFrom:
           secretKeyRef:
             name: ai-starter-kit-hf-token-secret
             key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
   hub:
     db:
       type: sqlite-pvc

@@ -81,6 +81,12 @@ jupyterhub:
       - name: welcome-ipynb
         configMap:
           name: "ai-starter-kit-welcome-ipynb"
+      - name: ray-ipynb
+        configMap:
+          name: "ai-starter-kit-ray-ipynb"
+      - name: chat-bot-ipynb
+        configMap:
+          name: "ai-starter-kit-chat-bot-ipynb"
     extraVolumeMounts:
       - name: requirements-txt
         mountPath: /tmp/requirements.txt
@@ -88,6 +94,12 @@ jupyterhub:
       - name: download-models-py
         mountPath: /tmp/download_models.py
         subPath: download_models.py
+      - name: ray-ipynb
+        mountPath: /tmp/ray.ipynb
+        subPath: ray.ipynb
+      - name: chat-bot-ipynb
+        mountPath: /tmp/chat_bot.ipynb
+        subPath: chat_bot.ipynb
     # This environment variables list have its own format: https://z2jh.jupyter.org/en/latest/resources/reference.html#singleuser-extraenv
     extraEnv:
       HF_TOKEN:
@@ -96,8 +108,8 @@ jupyterhub:
          secretKeyRef:
            name: ai-starter-kit-hf-token-secret
            key: token
-      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:6379"
-      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow-tracking"
+      RAY_ADDRESS: "ai-starter-kit-kuberay-head-svc:10001"
+      MLFLOW_TRACKING_URI: "http://ai-starter-kit-mlflow:5000"
   hub:
     db:
       type: sqlite-pvc
@@ -162,6 +174,18 @@ localPersistence:
   # This path must match the destination path inside the minikube node.
   hostPath: "/tmp/models-cache"

+
+ollama:
+  enabled: true
+  ollama:
+    models:
+      pull:
+        - gemma3
+  persistentVolume:
+    enabled: true
+    existingClaim: "ai-starter-kit-models-cache-pvc"
+    subPath: "ollama"
+
 ramalama:
   enabled: true
   command: ["sh", "-c" , "trap 'exit 0' TERM; while true; do sleep 60 & wait; done"]

@@ -0,0 +1,621 @@
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "079fadd2-200e-4d37-8ae2-be2792e3a24e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 1 - Initialize Ray endpoints and verify dashboard\n",
|
||||||
|
"\n",
|
||||||
|
"Installs requests, derives the Ray head host from RAY_ADDRESS, builds Dashboard/Serve/MLflow URLs, reads an Hugging Face token, and prints the endpoints plus the Jobs API version for a quick health check."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install requests==2.* --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, textwrap, base64, time, json, requests\n",
|
||||||
|
"from string import Template\n",
|
||||||
|
"\n",
|
||||||
|
"raw_addr = os.getenv(\"RAY_ADDRESS\", \"ray://ai-starter-kit-kuberay-head-svc:10001\")\n",
|
||||||
|
"if raw_addr.startswith(\"ray://\"):\n",
|
||||||
|
" HEAD_HOST = raw_addr.split(\"://\", 1)[1].split(\":\", 1)[0]\n",
|
||||||
|
"else:\n",
|
||||||
|
" HEAD_HOST = raw_addr.split(\":\", 1)[0] or \"ai-starter-kit-kuberay-head-svc\"\n",
|
||||||
|
"\n",
|
||||||
|
"DASH_URL = f\"http://{HEAD_HOST}:8265\"\n",
|
||||||
|
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"8000\"))\n",
|
||||||
|
"SERVE_ROUTE = \"/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"HF_TOKEN_PATH = \"/etc/secrets/huggingface/token\"\n",
|
||||||
|
"HF_TOKEN = \"\"\n",
|
||||||
|
"if os.path.exists(HF_TOKEN_PATH):\n",
|
||||||
|
" try:\n",
|
||||||
|
" HF_TOKEN = open(HF_TOKEN_PATH).read().strip()\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" HF_TOKEN = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Head host:\", HEAD_HOST)\n",
|
||||||
|
"print(\"Jobs API :\", f\"{DASH_URL}/api/jobs/\")\n",
|
||||||
|
"print(\"Serve URL:\", f\"http://{HEAD_HOST}:{SERVE_PORT}{SERVE_ROUTE}\")\n",
|
||||||
|
"print(\"MLflow :\", os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\"))\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Jobs API version:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe862173-fd9a-41ae-a27b-63875f788024",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 2 - Deploy a minimal Ray Serve smoke test and verify readiness\n",
|
||||||
|
"\n",
|
||||||
|
"Submits a tiny FastAPI app to Ray Serve (one /healthz endpoint under /smoke) as a Ray Job, installing FastAPI on the fly. It polls the Jobs API for status and hits :8000/smoke/healthz up to 60 seconds, printing when the service responds 200 (i.e., smoke test passes)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "34da3e26-6276-48b7-b3ac-c90359df6547",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, base64, textwrap, time, requests\n",
|
||||||
|
"\n",
|
||||||
|
"DASH_URL = \"http://ai-starter-kit-kuberay-head-svc:8265\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Jobs API:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n",
|
||||||
|
"\n",
|
||||||
|
"serve_py = textwrap.dedent(\"\"\"\n",
|
||||||
|
" from fastapi import FastAPI\n",
|
||||||
|
" from ray import serve\n",
|
||||||
|
" serve.start(detached=True, http_options={\"host\":\"0.0.0.0\",\"port\":8000})\n",
|
||||||
|
" app = FastAPI()\n",
|
||||||
|
"\n",
|
||||||
|
" @serve.deployment(name=\"smoke\", num_replicas=1)\n",
|
||||||
|
" @serve.ingress(app)\n",
|
||||||
|
" class Smoke:\n",
|
||||||
|
" @app.get(\"/healthz\")\n",
|
||||||
|
" async def health(self): return {\"ok\": True}\n",
|
||||||
|
"\n",
|
||||||
|
" serve.run(Smoke.bind(), route_prefix=\"/smoke\")\n",
|
||||||
|
" print(\"READY: smoke\", flush=True)\n",
|
||||||
|
"\"\"\").strip()\n",
|
||||||
|
"\n",
|
||||||
|
"b64 = base64.b64encode(serve_py.encode()).decode()\n",
|
||||||
|
"entry = f'python -c \"import base64; exec(base64.b64decode(\\'{b64}\\'))\"'\n",
|
||||||
|
"submit = requests.post(f\"{DASH_URL}/api/jobs/\", json={\"entrypoint\": entry, \"runtime_env\": {\"pip\": [\"fastapi>=0.110\"]}}, timeout=60).json()\n",
|
||||||
|
"job_id = submit[\"job_id\"]\n",
|
||||||
|
"print(\"Job:\", job_id)\n",
|
||||||
|
"\n",
|
||||||
|
"svc = \"http://ai-starter-kit-kuberay-head-svc:8000/smoke/healthz\"\n",
|
||||||
|
"for i in range(60):\n",
|
||||||
|
" s = requests.get(f\"{DASH_URL}/api/jobs/{job_id}\", timeout=10).json()[\"status\"]\n",
|
||||||
|
" try:\n",
|
||||||
|
" r = requests.get(svc, timeout=2)\n",
|
||||||
|
" print(f\"tick {i:02d}: job={s}, health={r.status_code}\")\n",
|
||||||
|
" if r.status_code == 200:\n",
|
||||||
|
" print(\"Smoke OK\")\n",
|
||||||
|
" break\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\"tick {i:02d}: job={s}, health=ERR {e}\")\n",
|
||||||
|
" time.sleep(1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8111d705-595e-4e65-8479-bdc76191fa31",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 3 - Deploy model on Ray Serve with llama-cpp\n",
|
||||||
|
"\n",
|
||||||
|
"Packages and submits a Ray Job that spins up a Ray Serve app exposing /v1/healthz and /v1/chat/completions. It downloads the preferred GGUF from Hugging Face, initializes llama-cpp-python, logs to MLflow, and prints the deployed health/chat URLs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bbea1539-e9ab-460a-9cfc-20a42807f616",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, base64, textwrap, requests\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"DASH_URL = f\"http://{HEAD}:8265\"\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"SERVE_ROUTE = \"/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"runtime_env = {\n",
|
||||||
|
" \"pip\": [\n",
|
||||||
|
" \"fastapi==0.110.0\",\n",
|
||||||
|
" \"uvicorn==0.23.2\",\n",
|
||||||
|
" \"huggingface_hub==0.25.2\",\n",
|
||||||
|
" \"llama-cpp-python==0.3.16\", \n",
|
||||||
|
" \"hf_transfer==0.1.6\",\n",
|
||||||
|
" \"mlflow==2.14.3\", \n",
|
||||||
|
" ],\n",
|
||||||
|
" \"env_vars\": {\n",
|
||||||
|
" \"HF_HUB_ENABLE_HF_TRANSFER\": \"1\",\n",
|
||||||
|
" \"HUGGINGFACE_HUB_TOKEN\": os.environ.get(\"HUGGINGFACE_HUB_TOKEN\", \"\"),\n",
|
||||||
|
" \"SERVE_PORT\": str(SERVE_PORT),\n",
|
||||||
|
"\n",
|
||||||
|
" \"MODEL_REPO\": \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\",\n",
|
||||||
|
" \"GGUF_PREF_ORDER\": \"q4_k_m,q4_0,q3_k_m,q2_k\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"LLM_CONTEXT\": os.environ.get(\"LLM_CONTEXT\", \"1024\"),\n",
|
||||||
|
" \"LLM_MAX_TOKENS\": os.environ.get(\"LLM_MAX_TOKENS\", \"256\"),\n",
|
||||||
|
" \"SERVER_MAX_NEW_TOKENS\": os.environ.get(\"SERVER_MAX_NEW_TOKENS\", \"512\"),\n",
|
||||||
|
"\n",
|
||||||
|
" \"LLM_THREADS\": os.environ.get(\"LLM_THREADS\", \"6\"),\n",
|
||||||
|
" \"OMP_NUM_THREADS\": os.environ.get(\"OMP_NUM_THREADS\", \"6\"),\n",
|
||||||
|
" \"GPU_LAYERS\": \"0\", \n",
|
||||||
|
" \n",
|
||||||
|
" \"PIP_PREFER_BINARY\": \"1\",\n",
|
||||||
|
" \"CMAKE_ARGS\": \"-DGGML_OPENMP=OFF -DLLAMA_NATIVE=OFF\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"HF_HOME\": \"/tmp/hf-cache\",\n",
|
||||||
|
" \"TRANSFORMERS_CACHE\": \"/tmp/hf-cache\",\n",
|
||||||
|
"\n",
|
||||||
|
" \"MLFLOW_TRACKING_URI\": os.environ.get(\"MLFLOW_TRACKING_URI\", \"\"),\n",
|
||||||
|
" \"MLFLOW_EXPERIMENT_NAME\": os.environ.get(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\"),\n",
|
||||||
|
" },\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"serve_py = textwrap.dedent(f\"\"\"\n",
|
||||||
|
"import os, time, multiprocessing, uuid\n",
|
||||||
|
"from typing import List, Dict, Any\n",
|
||||||
|
"from fastapi import FastAPI, Request\n",
|
||||||
|
"from fastapi.responses import JSONResponse\n",
|
||||||
|
"from huggingface_hub import HfApi, hf_hub_download\n",
|
||||||
|
"from ray import serve\n",
|
||||||
|
"from llama_cpp import Llama\n",
|
||||||
|
"\n",
|
||||||
|
"USE_MLFLOW = False\n",
|
||||||
|
"try:\n",
|
||||||
|
" import mlflow\n",
|
||||||
|
" if os.getenv(\"MLFLOW_TRACKING_URI\"):\n",
|
||||||
|
" mlflow.set_tracking_uri(os.getenv(\"MLFLOW_TRACKING_URI\"))\n",
|
||||||
|
" mlflow.set_experiment(os.getenv(\"MLFLOW_EXPERIMENT_NAME\",\"ray-llama-cpp\"))\n",
|
||||||
|
" USE_MLFLOW = True\n",
|
||||||
|
"except Exception as _e:\n",
|
||||||
|
" USE_MLFLOW = False\n",
|
||||||
|
"\n",
|
||||||
|
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"{SERVE_PORT}\"))\n",
|
||||||
|
"SERVE_ROUTE = \"{SERVE_ROUTE}\"\n",
|
||||||
|
"MODEL_REPO = os.getenv(\"MODEL_REPO\", \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\")\n",
|
||||||
|
"GGUF_PREFS = [s.strip() for s in os.getenv(\"GGUF_PREF_ORDER\",\"q4_k_m,q4_0,q3_k_m,q2_k\").split(\",\") if s.strip()]\n",
|
||||||
|
"CTX_LEN = int(os.getenv(\"LLM_CONTEXT\", \"2048\"))\n",
|
||||||
|
"MAX_TOKENS = int(os.getenv(\"LLM_MAX_TOKENS\", \"256\"))\n",
|
||||||
|
"HF_TOKEN = os.getenv(\"HUGGINGFACE_HUB_TOKEN\") or None\n",
|
||||||
|
"\n",
|
||||||
|
"serve.start(detached=True, http_options={{\"host\":\"0.0.0.0\", \"port\":SERVE_PORT}})\n",
|
||||||
|
"app = FastAPI()\n",
|
||||||
|
"\n",
|
||||||
|
"def pick_one_file(repo_id: str, prefs):\n",
|
||||||
|
" api = HfApi()\n",
|
||||||
|
" files = api.list_repo_files(repo_id=repo_id, repo_type=\"model\", token=HF_TOKEN)\n",
|
||||||
|
" ggufs = [f for f in files if f.lower().endswith(\".gguf\")]\n",
|
||||||
|
" if not ggufs:\n",
|
||||||
|
" raise RuntimeError(f\"No .gguf files visible in {{repo_id}}\")\n",
|
||||||
|
" for pref in prefs:\n",
|
||||||
|
" for f in ggufs:\n",
|
||||||
|
" if pref.lower() in f.lower():\n",
|
||||||
|
" return f\n",
|
||||||
|
" return ggufs[0]\n",
|
||||||
|
"\n",
|
||||||
|
"def pick_chat_format(repo: str, fname: str) -> str:\n",
|
||||||
|
" return \"qwen\"\n",
|
||||||
|
"\n",
|
||||||
|
"@serve.deployment(name=\"qwen\", num_replicas=1, ray_actor_options={{\"num_cpus\": 6}})\n",
|
||||||
|
"@serve.ingress(app)\n",
|
||||||
|
"class OpenAICompatLlama:\n",
|
||||||
|
" def __init__(self, repo_id: str = MODEL_REPO):\n",
|
||||||
|
" target = pick_one_file(repo_id, GGUF_PREFS)\n",
|
||||||
|
" print(f\"[env] model repo: {{repo_id}} file: {{target}}\", flush=True)\n",
|
||||||
|
" local_dir = \"/tmp/hf-gguf\"; os.makedirs(local_dir, exist_ok=True)\n",
|
||||||
|
"\n",
|
||||||
|
" gguf_path = hf_hub_download(\n",
|
||||||
|
" repo_id=repo_id, filename=target, token=HF_TOKEN,\n",
|
||||||
|
" local_dir=local_dir, local_dir_use_symlinks=False,\n",
|
||||||
|
" force_download=False, resume_download=True\n",
|
||||||
|
" )\n",
|
||||||
|
" print(f\"[download] done: {{gguf_path}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" n_threads = int(os.getenv(\"LLM_THREADS\", max(2, (multiprocessing.cpu_count() or 4)//2)))\n",
|
||||||
|
" print(f\"[load] llama-cpp-python | ctx={{CTX_LEN}} threads={{n_threads}} gpu_layers={{int(os.getenv('GPU_LAYERS','0'))}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" self.model_file = os.path.basename(gguf_path)\n",
|
||||||
|
" self.model_repo = repo_id\n",
|
||||||
|
" chat_format = pick_chat_format(self.model_repo, self.model_file)\n",
|
||||||
|
" print(f\"[load] chat_format={{chat_format}}\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" self.llm = Llama(\n",
|
||||||
|
" model_path=gguf_path,\n",
|
||||||
|
" n_ctx=CTX_LEN,\n",
|
||||||
|
" n_threads=n_threads,\n",
|
||||||
|
" n_batch=256, \n",
|
||||||
|
" n_gpu_layers=int(os.getenv(\"GPU_LAYERS\",\"0\")),\n",
|
||||||
|
" chat_format=chat_format,\n",
|
||||||
|
" verbose=False\n",
|
||||||
|
" )\n",
|
||||||
|
" print(\"[ready] model loaded\", flush=True)\n",
|
||||||
|
"\n",
|
||||||
|
" @app.get(\"/healthz\")\n",
|
||||||
|
" async def health(self):\n",
|
||||||
|
" return {{\"status\":\"ok\"}}\n",
|
||||||
|
"\n",
|
||||||
|
" @app.post(\"/chat/completions\")\n",
|
||||||
|
" async def chat_completions(self, request: Request):\n",
|
||||||
|
" t0 = time.time()\n",
|
||||||
|
" body = await request.json()\n",
|
||||||
|
"\n",
|
||||||
|
" messages = body.get(\"messages\", [])\n",
|
||||||
|
" temperature = float(body.get(\"temperature\", 0.2))\n",
|
||||||
|
" req_max = body.get(\"max_tokens\", None)\n",
|
||||||
|
" stop_words = (body.get(\"stop\", []) or []) + [\"<|im_end|>\", \"</s>\"]\n",
|
||||||
|
"\n",
|
||||||
|
" SERVER_MAX = int(os.getenv(\"SERVER_MAX_NEW_TOKENS\", \"512\"))\n",
|
||||||
|
" max_tokens = int(req_max if isinstance(req_max, int) else MAX_TOKENS)\n",
|
||||||
|
" max_tokens = max(32, min(max_tokens, CTX_LEN - 128, SERVER_MAX))\n",
|
||||||
|
"\n",
|
||||||
|
" rid = \"chatcmpl-\" + uuid.uuid4().hex[:24]\n",
|
||||||
|
" created = int(time.time())\n",
|
||||||
|
" model_name = f\"{{self.model_repo}}/{{self.model_file}}\"\n",
|
||||||
|
"\n",
|
||||||
|
" try:\n",
|
||||||
|
" result = self.llm.create_chat_completion(\n",
|
||||||
|
" messages=messages,\n",
|
||||||
|
" temperature=temperature,\n",
|
||||||
|
" max_tokens=max_tokens,\n",
|
||||||
|
" top_k=50,\n",
|
||||||
|
" top_p=0.9,\n",
|
||||||
|
" repeat_penalty=1.1,\n",
|
||||||
|
" stop=stop_words,\n",
|
||||||
|
" )\n",
|
||||||
|
" out_text = (result[\"choices\"][0][\"message\"][\"content\"] or \"\").strip()\n",
|
||||||
|
" usage_raw = result.get(\"usage\") or {{}}\n",
|
||||||
|
" p_tokens = int(usage_raw.get(\"prompt_tokens\") or 0)\n",
|
||||||
|
" c_tokens = int(usage_raw.get(\"completion_tokens\") or 0)\n",
|
||||||
|
" err = None\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" out_text = \"\"\n",
|
||||||
|
" p_tokens = c_tokens = 0\n",
|
||||||
|
" err = str(e)\n",
|
||||||
|
"\n",
|
||||||
|
" if USE_MLFLOW:\n",
|
||||||
|
" try:\n",
|
||||||
|
" dur_ms = int((time.time()-t0) * 1000)\n",
|
||||||
|
" with mlflow.start_run(run_name=\"chat\"):\n",
|
||||||
|
" mlflow.set_tags({{\n",
|
||||||
|
" \"model_repo\": self.model_repo,\n",
|
||||||
|
" \"model_file\": self.model_file,\n",
|
||||||
|
" \"framework\": \"llama-cpp-python\",\n",
|
||||||
|
" }})\n",
|
||||||
|
" mlflow.log_params({{\n",
|
||||||
|
" \"temperature\": temperature,\n",
|
||||||
|
" \"max_tokens\": max_tokens,\n",
|
||||||
|
" \"ctx\": CTX_LEN,\n",
|
||||||
|
" }})\n",
|
||||||
|
" if not (p_tokens and c_tokens):\n",
|
||||||
|
" p_tokens = p_tokens or max(1, len(\" \".join(m.get(\"content\",\"\") for m in messages).split()))\n",
|
||||||
|
" c_tokens = c_tokens or max(0, len(out_text.split()))\n",
|
||||||
|
" mlflow.log_metrics({{\n",
|
||||||
|
" \"duration_ms\": dur_ms,\n",
|
||||||
|
" \"prompt_tokens_approx\": p_tokens,\n",
|
||||||
|
" \"completion_tokens_approx\": c_tokens,\n",
|
||||||
|
" \"total_tokens_approx\": p_tokens + c_tokens,\n",
|
||||||
|
" }})\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" pass\n",
|
||||||
|
"\n",
|
||||||
|
" if err:\n",
|
||||||
|
" return JSONResponse(status_code=500, content={{\"error\": err, \"type\":\"generation_error\"}})\n",
|
||||||
|
"\n",
|
||||||
|
" usage = {{\n",
|
||||||
|
" \"prompt_tokens\": p_tokens,\n",
|
||||||
|
" \"completion_tokens\": c_tokens,\n",
|
||||||
|
" \"total_tokens\": p_tokens + c_tokens,\n",
|
||||||
|
" }}\n",
|
||||||
|
" return {{\n",
|
||||||
|
" \"id\": rid,\n",
|
||||||
|
" \"object\": \"chat.completion\",\n",
|
||||||
|
" \"created\": created,\n",
|
||||||
|
" \"model\": model_name,\n",
|
||||||
|
" \"choices\": [\n",
|
||||||
|
" {{\n",
|
||||||
|
" \"index\": 0,\n",
|
||||||
|
" \"message\": {{\"role\":\"assistant\",\"content\": out_text}},\n",
|
||||||
|
" \"finish_reason\": \"stop\"\n",
|
||||||
|
" }}\n",
|
||||||
|
" ],\n",
|
||||||
|
" \"usage\": usage\n",
|
||||||
|
" }}\n",
|
||||||
|
"\n",
|
||||||
|
"serve.run(OpenAICompatLlama.bind(), route_prefix=SERVE_ROUTE)\n",
|
||||||
|
"print(\"READY\", flush=True)\n",
|
||||||
|
"\"\"\").strip()\n",
|
||||||
|
"\n",
|
||||||
|
"payload = base64.b64encode(serve_py.encode()).decode()\n",
|
||||||
|
"entrypoint = 'python -c \"import base64,sys;exec(base64.b64decode(\\'{}\\').decode())\"'.format(payload)\n",
|
||||||
|
"\n",
|
||||||
|
"job = requests.post(\n",
|
||||||
|
" f\"{DASH_URL}/api/jobs/\",\n",
|
||||||
|
" json={\n",
|
||||||
|
" \"entrypoint\": entrypoint,\n",
|
||||||
|
" \"runtime_env\": runtime_env,\n",
|
||||||
|
" \"metadata\": {\"job_name\": \"serve-qwen2_5-llama_cpp-openai\"},\n",
|
||||||
|
" },\n",
|
||||||
|
" timeout=45\n",
|
||||||
|
").json()\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Job:\", job.get(\"job_id\"))\n",
|
||||||
|
"print(\"Health:\", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/healthz\")\n",
|
||||||
|
"print(\"Chat: \", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/chat/completions\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a411c015-c802-4ca1-81bb-3f4790d9626a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 4 - Basic client + latency test\n",
|
||||||
|
"\n",
|
||||||
|
"Calls /v1/healthz and then sends an OpenAI-style chat request to /v1/chat/completions with a short prompt. Prints latency and token usage, returning the assistant text."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3be634e2-a82f-42c9-8e31-57e6868a86ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os, time, requests, json\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\"\n",
|
||||||
|
"\n",
|
||||||
|
"def health():\n",
|
||||||
|
" r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n",
|
||||||
|
" print(\"Health:\", r.status_code, r.json())\n",
|
||||||
|
"\n",
|
||||||
|
"def chat(prompt, temperature=0.4, max_tokens=220, stop=None):\n",
|
||||||
|
" body = {\n",
|
||||||
|
" \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n",
|
||||||
|
" \"temperature\": float(temperature),\n",
|
||||||
|
" \"max_tokens\": int(max_tokens),\n",
|
||||||
|
" \"messages\": [\n",
|
||||||
|
" {\"role\": \"system\", \"content\": \"You are Qwen2.5 Instruct running on a tiny CPU host. Be concise, complete sentences.\"},\n",
|
||||||
|
" {\"role\": \"user\", \"content\": prompt},\n",
|
||||||
|
" ],\n",
|
||||||
|
" }\n",
|
||||||
|
" if stop:\n",
|
||||||
|
" body[\"stop\"] = stop\n",
|
||||||
|
"\n",
|
||||||
|
" t0 = time.time()\n",
|
||||||
|
" r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=300)\n",
|
||||||
|
" dt = time.time() - t0\n",
|
||||||
|
" r.raise_for_status()\n",
|
||||||
|
" out = r.json()[\"choices\"][0][\"message\"][\"content\"]\n",
|
||||||
|
" usage = r.json().get(\"usage\", {})\n",
|
||||||
|
" print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n",
|
||||||
|
" print(\"\\n---\\n\", out)\n",
|
||||||
|
" return out\n",
|
||||||
|
"\n",
|
||||||
|
"health()\n",
|
||||||
|
"_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\", stop=[\"<|im_end|>\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "553d2756-8949-43e3-8342-71387688e0fa",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 5 - Multi-agent (Autogen) pipeline\n",
|
||||||
|
"\n",
|
||||||
|
"Installs Autogen, configures OpenAIWrapper to hit Ray Serve /v1 endpoint, warms up the model, then runs a simple three-agent workflow (Researcher -> Writer -> Critic) to produce and refine a short report."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install pyautogen~=0.2.35 \"flaml[automl]\" --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, sys\n",
|
||||||
|
"\n",
|
||||||
|
"for p in [\n",
|
||||||
|
" \"/tmp/models-cache/lib/python3.11/site-packages\", \n",
|
||||||
|
" os.path.expanduser(\"~/.local/lib/python3.11/site-packages\"), \n",
|
||||||
|
"]:\n",
|
||||||
|
" if os.path.isdir(p) and p not in sys.path:\n",
|
||||||
|
" sys.path.insert(0, p)\n",
|
||||||
|
"\n",
|
||||||
|
"import os, autogen\n",
|
||||||
|
"from autogen import AssistantAgent, UserProxyAgent\n",
|
||||||
|
"\n",
|
||||||
|
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
|
||||||
|
"SERVE_PORT = 8000\n",
|
||||||
|
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\" \n",
|
||||||
|
"\n",
|
||||||
|
"config_list = [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"model\": \"qwen2.5-1.5b-instruct-gguf\", \n",
|
||||||
|
" \"base_url\": BASE_URL, \n",
|
||||||
|
" \"api_key\": \"local\", \n",
|
||||||
|
" \"price\": [0.0, 0.0],\n",
|
||||||
|
" }\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"llm = autogen.OpenAIWrapper(config_list=config_list)\n",
|
||||||
|
"try:\n",
|
||||||
|
" r = llm.create(messages=[{\"role\":\"user\",\"content\":\"Say 'test ok'.\"}], temperature=0.2, max_tokens=16)\n",
|
||||||
|
" print(\"Warmup:\", r.choices[0].message.content)\n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(\"Warmup failed:\", e)\n",
|
||||||
|
"\n",
|
||||||
|
"user_proxy = UserProxyAgent(\n",
|
||||||
|
" name=\"UserProxy\",\n",
|
||||||
|
" system_message=\"You are the human admin. Initiate the task.\",\n",
|
||||||
|
" code_execution_config=False,\n",
|
||||||
|
" human_input_mode=\"NEVER\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"researcher = AssistantAgent(\n",
|
||||||
|
" name=\"Researcher\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a researcher. Gather concise, verified facts on the topic. \"\n",
|
||||||
|
" \"Return several bullet points with inline source domains (e.g., nature.com, ibm.com). \"\n",
|
||||||
|
" \"Keep under 100 words total. No made-up sources. \"\n",
|
||||||
|
" \"Do not include any special end token.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.35, \"max_tokens\": 140, \"timeout\": 300},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"writer = AssistantAgent(\n",
|
||||||
|
" name=\"Writer\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a writer. Using the Researcher’s notes, produce a clear word report under 160 words. \"\n",
|
||||||
|
" \"Avoid speculation. Keep it structured and readable. \"\n",
|
||||||
|
" \"Do not include any special end token.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.55, \"max_tokens\": 220, \"timeout\": 180},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"critic = AssistantAgent(\n",
|
||||||
|
" name=\"Critic\",\n",
|
||||||
|
" system_message=(\n",
|
||||||
|
" \"You are a critic. Review the Writer’s report for accuracy, clarity, and flow.\"\n",
|
||||||
|
" \"Present the tightened final text and keep it under 140 words. On a new last line output exactly: <|END|>\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" llm_config={\"config_list\": config_list, \"temperature\": 0.45, \"max_tokens\": 160, \"timeout\": 300},\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"def run_sequential(task):\n",
|
||||||
|
" research_response = researcher.generate_reply(messages=[{\"content\": task, \"role\": \"user\"}])\n",
|
||||||
|
" research_notes = research_response if isinstance(research_response, str) else research_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nResearch Notes:\\n\", research_notes)\n",
|
||||||
|
"\n",
|
||||||
|
" writer_prompt = f\"Using these research notes, write the report:\\n{research_notes}\"\n",
|
||||||
|
" writer_response = writer.generate_reply(messages=[{\"content\": writer_prompt, \"role\": \"user\"}])\n",
|
||||||
|
" report = writer_response if isinstance(writer_response, str) else writer_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nDraft Report:\\n\", report)\n",
|
||||||
|
"\n",
|
||||||
|
" critic_prompt = f\"Review this report:\\n{report}\"\n",
|
||||||
|
" critic_response = critic.generate_reply(messages=[{\"content\": critic_prompt, \"role\": \"user\"}])\n",
|
||||||
|
" final_text = critic_response if isinstance(critic_response, str) else critic_response.get(\"content\", \"[no output]\")\n",
|
||||||
|
" print(\"\\nFinal Review:\\n\", final_text)\n",
|
||||||
|
" return final_text\n",
|
||||||
|
"\n",
|
||||||
|
"task = \"Research the latest advancements in quantum computing as of 2025. Gather key facts, then write a short report (200–300 words). Have the Critic review and finalize.\"\n",
|
||||||
|
"final_output = run_sequential(task)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cell 6 - MLFlow: connect to tracking server and list recent chat runs\n",
|
||||||
|
"\n",
|
||||||
|
"Installs MLflow, sets the tracking URI and experiment, then queries and prints the latest runs with key params/metrics (temperature, max_tokens, duration) to verify Serve logging."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip -q install mlflow==2.14.3 --disable-pip-version-check\n",
|
||||||
|
"\n",
|
||||||
|
"import os, mlflow\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"\n",
|
||||||
|
"tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n",
|
||||||
|
"mlflow.set_tracking_uri(tracking_uri)\n",
|
||||||
|
"print(f\"MLflow Tracking URI: {tracking_uri}\")\n",
|
||||||
|
"\n",
|
||||||
|
"exp_name = os.getenv(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\")\n",
|
||||||
|
"exp = mlflow.set_experiment(exp_name)\n",
|
||||||
|
"print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n",
|
||||||
|
"print(\"-\" * 60)\n",
|
||||||
|
"\n",
|
||||||
|
"client = mlflow.tracking.MlflowClient()\n",
|
||||||
|
"runs = client.search_runs(\n",
|
||||||
|
" exp.experiment_id, \n",
|
||||||
|
" order_by=[\"attributes.start_time DESC\"], \n",
|
||||||
|
" max_results=10\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"if not runs:\n",
|
||||||
|
" print(\"No runs found. Run cells 4 or 5 first to generate inference requests.\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(f\"\\nFound {len(runs)} recent runs:\")\n",
|
||||||
|
" print(\"-\" * 60)\n",
|
||||||
|
" \n",
|
||||||
|
" for i, run in enumerate(runs, 1):\n",
|
||||||
|
" start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n",
|
||||||
|
" duration = run.data.metrics.get('duration_ms', 'N/A')\n",
|
||||||
|
" temp = run.data.params.get('temperature', 'N/A')\n",
|
||||||
|
" max_tokens = run.data.params.get('max_tokens', 'N/A')\n",
|
||||||
|
" total_tokens = run.data.metrics.get('total_tokens_approx', 'N/A')\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"\\nRun {i}:\")\n",
|
||||||
|
" print(f\" ID: {run.info.run_id[:12]}...\")\n",
|
||||||
|
" print(f\" Time: {start_time}\")\n",
|
||||||
|
" print(f\" Status: {run.info.status}\")\n",
|
||||||
|
" print(f\" Temperature: {temp}\")\n",
|
||||||
|
" print(f\" Max Tokens: {max_tokens}\")\n",
|
||||||
|
" print(f\" Duration: {duration} ms\")\n",
|
||||||
|
" print(f\" Total Tokens: {total_tokens}\")\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"\\n\" + \"=\" * 60)\n",
|
||||||
|
" print(\"SUMMARY:\")\n",
|
||||||
|
" successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n",
|
||||||
|
" durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n",
|
||||||
|
" avg_duration = sum(durations) / len(durations) if durations else 0\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\" Total Runs: {len(runs)}\")\n",
|
||||||
|
" print(f\" Successful: {successful}\")\n",
|
||||||
|
" print(f\" Failed: {len(runs) - successful}\")\n",
|
||||||
|
" print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"\\n\" + \"=\" * 60)\n",
|
||||||
|
"print(\"MLflow verification complete\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|
@@ -0,0 +1,11 @@
from ollama import Client
client = Client(
  host='http://ai-starter-kit-ollama:11434',
  headers={'x-some-header': 'some-value'}
)
response = client.chat(model='gemma3', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
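
The new example builds the request but stops short of showing the reply. A possible continuation (not part of this commit; it assumes the same in-cluster service and the dict-style response the Ollama Python client returns):

```python
from ollama import Client

client = Client(host='http://ai-starter-kit-ollama:11434')

# Single-shot request: the full reply arrives as one message.
response = client.chat(model='gemma3', messages=[
    {'role': 'user', 'content': 'Why is the sky blue?'},
])
print(response['message']['content'])

# Streaming variant: chunks arrive incrementally instead of one final message.
for chunk in client.chat(
    model='gemma3',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
    stream=True,
):
    print(chunk['message']['content'], end='', flush=True)
```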