examples/ai/ai-starter-kit/notebooks/multi-agent.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "079fadd2-200e-4d37-8ae2-be2792e3a24e",
"metadata": {},
"source": [
"### Cell 1 - Initialize Ray endpoints and verify dashboard\n",
"\n",
"Installs requests, derives the Ray head host from RAY_ADDRESS, builds Dashboard/Serve/MLflow URLs, reads an Hugging Face token, and prints the endpoints plus the Jobs API version for a quick health check."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79db57cd-fb72-4b10-b0fb-5e9cd5c007b6",
"metadata": {},
"outputs": [],
"source": [
"!pip -q install requests==2.* --disable-pip-version-check\n",
"\n",
"import os, textwrap, base64, time, json, requests\n",
"from string import Template\n",
"\n",
"raw_addr = os.getenv(\"RAY_ADDRESS\", \"ray://ai-starter-kit-kuberay-head-svc:10001\")\n",
"if raw_addr.startswith(\"ray://\"):\n",
" HEAD_HOST = raw_addr.split(\"://\", 1)[1].split(\":\", 1)[0]\n",
"else:\n",
" HEAD_HOST = raw_addr.split(\":\", 1)[0] or \"ai-starter-kit-kuberay-head-svc\"\n",
"\n",
"DASH_URL = f\"http://{HEAD_HOST}:8265\"\n",
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"8000\"))\n",
"SERVE_ROUTE = \"/v1\"\n",
"\n",
"HF_TOKEN_PATH = \"/etc/secrets/huggingface/token\"\n",
"HF_TOKEN = \"\"\n",
"if os.path.exists(HF_TOKEN_PATH):\n",
" try:\n",
" HF_TOKEN = open(HF_TOKEN_PATH).read().strip()\n",
" except Exception:\n",
" HF_TOKEN = \"\"\n",
"\n",
"print(\"Head host:\", HEAD_HOST)\n",
"print(\"Jobs API :\", f\"{DASH_URL}/api/jobs/\")\n",
"print(\"Serve URL:\", f\"http://{HEAD_HOST}:{SERVE_PORT}{SERVE_ROUTE}\")\n",
"print(\"MLflow :\", os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\"))\n",
"\n",
"print(\"Jobs API version:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n"
]
},
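{
"cell_type": "markdown",
"id": "c1b2a3d4-1111-4abc-9def-0123456789ab",
"metadata": {},
"source": [
"### Cell 1b - (Optional) List jobs already on the cluster\n",
"\n",
"A small optional check before deploying anything: a GET on the same Jobs API endpoint verified above should list any jobs already submitted to this cluster, which helps spot leftover Serve deployments from earlier notebook runs. This is a sketch that assumes `DASH_URL` from Cell 1 and that `GET /api/jobs/` returns a JSON list whose entries carry `submission_id`, `status`, and `entrypoint` fields; field names can differ slightly between Ray versions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2c3b4a5-2222-4bcd-8ef0-123456789abc",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# Optional sketch: enumerate jobs known to the Ray Jobs API (reuses DASH_URL from Cell 1).\n",
"# The field names below are the common ones in recent Ray releases; adjust if yours differ.\n",
"jobs = requests.get(f\"{DASH_URL}/api/jobs/\", timeout=10).json()\n",
"print(f\"{len(jobs)} job(s) found\")\n",
"for j in jobs:\n",
"    jid = j.get(\"submission_id\") or j.get(\"job_id\")\n",
"    print(f\"{jid} | {j.get('status')} | {(j.get('entrypoint') or '')[:60]}\")"
]
},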
{
"cell_type": "markdown",
"id": "fe862173-fd9a-41ae-a27b-63875f788024",
"metadata": {},
"source": [
"### Cell 2 - Deploy a minimal Ray Serve smoke test and verify readiness\n",
"\n",
"Submits a tiny FastAPI app to Ray Serve (one /healthz endpoint under /smoke) as a Ray Job, installing FastAPI on the fly. It polls the Jobs API for status and hits :8000/smoke/healthz up to 60 seconds, printing when the service responds 200 (i.e., smoke test passes)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34da3e26-6276-48b7-b3ac-c90359df6547",
"metadata": {},
"outputs": [],
"source": [
"import os, base64, textwrap, time, requests\n",
"\n",
"DASH_URL = \"http://ai-starter-kit-kuberay-head-svc:8265\"\n",
"\n",
"print(\"Jobs API:\", requests.get(f\"{DASH_URL}/api/version\", timeout=10).json())\n",
"\n",
"serve_py = textwrap.dedent(\"\"\"\n",
" from fastapi import FastAPI\n",
" from ray import serve\n",
" serve.start(detached=True, http_options={\"host\":\"0.0.0.0\",\"port\":8000})\n",
" app = FastAPI()\n",
"\n",
" @serve.deployment(name=\"smoke\", num_replicas=1)\n",
" @serve.ingress(app)\n",
" class Smoke:\n",
" @app.get(\"/healthz\")\n",
" async def health(self): return {\"ok\": True}\n",
"\n",
" serve.run(Smoke.bind(), route_prefix=\"/smoke\")\n",
" print(\"READY: smoke\", flush=True)\n",
"\"\"\").strip()\n",
"\n",
"b64 = base64.b64encode(serve_py.encode()).decode()\n",
"entry = f'python -c \"import base64; exec(base64.b64decode(\\'{b64}\\'))\"'\n",
"submit = requests.post(f\"{DASH_URL}/api/jobs/\", json={\"entrypoint\": entry, \"runtime_env\": {\"pip\": [\"fastapi>=0.110\"]}}, timeout=60).json()\n",
"job_id = submit[\"job_id\"]\n",
"print(\"Job:\", job_id)\n",
"\n",
"svc = \"http://ai-starter-kit-kuberay-head-svc:8000/smoke/healthz\"\n",
"for i in range(60):\n",
" s = requests.get(f\"{DASH_URL}/api/jobs/{job_id}\", timeout=10).json()[\"status\"]\n",
" try:\n",
" r = requests.get(svc, timeout=2)\n",
" print(f\"tick {i:02d}: job={s}, health={r.status_code}\")\n",
" if r.status_code == 200:\n",
" print(\"Smoke OK\")\n",
" break\n",
" except Exception as e:\n",
" print(f\"tick {i:02d}: job={s}, health=ERR {e}\")\n",
" time.sleep(1)"
]
},
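{
"cell_type": "markdown",
"id": "e3d4c5b6-3333-4cde-9f01-23456789abcd",
"metadata": {},
"source": [
"### Cell 2b - (Optional) Fetch the smoke-test job's driver logs\n",
"\n",
"If the health probe above never returns 200, the quickest diagnostic is the driver log of the job itself. This sketch assumes `DASH_URL` and `job_id` from Cell 2 and that the Ray Jobs API exposes a `GET /api/jobs/<job_id>/logs` route returning `{\"logs\": \"...\"}` (the case for recent Ray versions); it only prints the tail so the output stays readable."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4e5d6c7-4444-4def-a012-3456789abcde",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# Optional sketch: tail the smoke-test job's driver logs for debugging.\n",
"# Reuses DASH_URL and job_id from Cell 2; the /logs route is assumed from the Ray Jobs REST API.\n",
"resp = requests.get(f\"{DASH_URL}/api/jobs/{job_id}/logs\", timeout=10)\n",
"resp.raise_for_status()\n",
"print(resp.json().get(\"logs\", \"\")[-2000:])  # last ~2000 characters"
]
},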
{
"cell_type": "markdown",
"id": "8111d705-595e-4e65-8479-bdc76191fa31",
"metadata": {},
"source": [
"### Cell 3 - Deploy model on Ray Serve with llama-cpp\n",
"\n",
"Packages and submits a Ray Job that spins up a Ray Serve app exposing /v1/healthz and /v1/chat/completions. It downloads the preferred GGUF from Hugging Face, initializes llama-cpp-python, logs to MLflow, and prints the deployed health/chat URLs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bbea1539-e9ab-460a-9cfc-20a42807f616",
"metadata": {},
"outputs": [],
"source": [
"import os, base64, textwrap, requests\n",
"\n",
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
"DASH_URL = f\"http://{HEAD}:8265\"\n",
"SERVE_PORT = 8000\n",
"SERVE_ROUTE = \"/v1\"\n",
"\n",
"runtime_env = {\n",
" \"pip\": [\n",
" \"fastapi==0.110.0\",\n",
" \"uvicorn==0.23.2\",\n",
" \"huggingface_hub==0.25.2\",\n",
" \"llama-cpp-python==0.3.16\", \n",
" \"hf_transfer==0.1.6\",\n",
" \"mlflow==2.14.3\", \n",
" ],\n",
" \"env_vars\": {\n",
" \"HF_HUB_ENABLE_HF_TRANSFER\": \"1\",\n",
" \"HUGGINGFACE_HUB_TOKEN\": os.environ.get(\"HUGGINGFACE_HUB_TOKEN\", \"\"),\n",
" \"SERVE_PORT\": str(SERVE_PORT),\n",
"\n",
" \"MODEL_REPO\": \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\",\n",
" \"GGUF_PREF_ORDER\": \"q4_k_m,q4_0,q3_k_m,q2_k\",\n",
"\n",
" \"LLM_CONTEXT\": os.environ.get(\"LLM_CONTEXT\", \"1024\"),\n",
" \"LLM_MAX_TOKENS\": os.environ.get(\"LLM_MAX_TOKENS\", \"256\"),\n",
" \"SERVER_MAX_NEW_TOKENS\": os.environ.get(\"SERVER_MAX_NEW_TOKENS\", \"512\"),\n",
"\n",
" \"LLM_THREADS\": os.environ.get(\"LLM_THREADS\", \"6\"),\n",
" \"OMP_NUM_THREADS\": os.environ.get(\"OMP_NUM_THREADS\", \"6\"),\n",
" \"GPU_LAYERS\": \"0\", \n",
" \n",
" \"PIP_PREFER_BINARY\": \"1\",\n",
" \"CMAKE_ARGS\": \"-DGGML_OPENMP=OFF -DLLAMA_NATIVE=OFF\",\n",
"\n",
" \"HF_HOME\": \"/tmp/hf-cache\",\n",
" \"TRANSFORMERS_CACHE\": \"/tmp/hf-cache\",\n",
"\n",
" \"MLFLOW_TRACKING_URI\": os.environ.get(\"MLFLOW_TRACKING_URI\", \"\"),\n",
" \"MLFLOW_EXPERIMENT_NAME\": os.environ.get(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\"),\n",
" },\n",
"}\n",
"\n",
"serve_py = textwrap.dedent(f\"\"\"\n",
"import os, time, multiprocessing, uuid\n",
"from typing import List, Dict, Any\n",
"from fastapi import FastAPI, Request\n",
"from fastapi.responses import JSONResponse\n",
"from huggingface_hub import HfApi, hf_hub_download\n",
"from ray import serve\n",
"from llama_cpp import Llama\n",
"\n",
"USE_MLFLOW = False\n",
"try:\n",
" import mlflow\n",
" if os.getenv(\"MLFLOW_TRACKING_URI\"):\n",
" mlflow.set_tracking_uri(os.getenv(\"MLFLOW_TRACKING_URI\"))\n",
" mlflow.set_experiment(os.getenv(\"MLFLOW_EXPERIMENT_NAME\",\"ray-llama-cpp\"))\n",
" USE_MLFLOW = True\n",
"except Exception as _e:\n",
" USE_MLFLOW = False\n",
"\n",
"SERVE_PORT = int(os.getenv(\"SERVE_PORT\", \"{SERVE_PORT}\"))\n",
"SERVE_ROUTE = \"{SERVE_ROUTE}\"\n",
"MODEL_REPO = os.getenv(\"MODEL_REPO\", \"Qwen/Qwen2.5-1.5B-Instruct-GGUF\")\n",
"GGUF_PREFS = [s.strip() for s in os.getenv(\"GGUF_PREF_ORDER\",\"q4_k_m,q4_0,q3_k_m,q2_k\").split(\",\") if s.strip()]\n",
"CTX_LEN = int(os.getenv(\"LLM_CONTEXT\", \"2048\"))\n",
"MAX_TOKENS = int(os.getenv(\"LLM_MAX_TOKENS\", \"256\"))\n",
"HF_TOKEN = os.getenv(\"HUGGINGFACE_HUB_TOKEN\") or None\n",
"\n",
"serve.start(detached=True, http_options={{\"host\":\"0.0.0.0\", \"port\":SERVE_PORT}})\n",
"app = FastAPI()\n",
"\n",
"def pick_one_file(repo_id: str, prefs):\n",
" api = HfApi()\n",
" files = api.list_repo_files(repo_id=repo_id, repo_type=\"model\", token=HF_TOKEN)\n",
" ggufs = [f for f in files if f.lower().endswith(\".gguf\")]\n",
" if not ggufs:\n",
" raise RuntimeError(f\"No .gguf files visible in {{repo_id}}\")\n",
" for pref in prefs:\n",
" for f in ggufs:\n",
" if pref.lower() in f.lower():\n",
" return f\n",
" return ggufs[0]\n",
"\n",
"def pick_chat_format(repo: str, fname: str) -> str:\n",
" return \"qwen\"\n",
"\n",
"@serve.deployment(name=\"qwen\", num_replicas=1, ray_actor_options={{\"num_cpus\": 6}})\n",
"@serve.ingress(app)\n",
"class OpenAICompatLlama:\n",
" def __init__(self, repo_id: str = MODEL_REPO):\n",
" target = pick_one_file(repo_id, GGUF_PREFS)\n",
" print(f\"[env] model repo: {{repo_id}} file: {{target}}\", flush=True)\n",
" local_dir = \"/tmp/hf-gguf\"; os.makedirs(local_dir, exist_ok=True)\n",
"\n",
" gguf_path = hf_hub_download(\n",
" repo_id=repo_id, filename=target, token=HF_TOKEN,\n",
" local_dir=local_dir, local_dir_use_symlinks=False,\n",
" force_download=False, resume_download=True\n",
" )\n",
" print(f\"[download] done: {{gguf_path}}\", flush=True)\n",
"\n",
" n_threads = int(os.getenv(\"LLM_THREADS\", max(2, (multiprocessing.cpu_count() or 4)//2)))\n",
" print(f\"[load] llama-cpp-python | ctx={{CTX_LEN}} threads={{n_threads}} gpu_layers={{int(os.getenv('GPU_LAYERS','0'))}}\", flush=True)\n",
"\n",
" self.model_file = os.path.basename(gguf_path)\n",
" self.model_repo = repo_id\n",
" chat_format = pick_chat_format(self.model_repo, self.model_file)\n",
" print(f\"[load] chat_format={{chat_format}}\", flush=True)\n",
"\n",
" self.llm = Llama(\n",
" model_path=gguf_path,\n",
" n_ctx=CTX_LEN,\n",
" n_threads=n_threads,\n",
" n_batch=256, \n",
" n_gpu_layers=int(os.getenv(\"GPU_LAYERS\",\"0\")),\n",
" chat_format=chat_format,\n",
" verbose=False\n",
" )\n",
" print(\"[ready] model loaded\", flush=True)\n",
"\n",
" @app.get(\"/healthz\")\n",
" async def health(self):\n",
" return {{\"status\":\"ok\"}}\n",
"\n",
" @app.post(\"/chat/completions\")\n",
" async def chat_completions(self, request: Request):\n",
" t0 = time.time()\n",
" body = await request.json()\n",
"\n",
" messages = body.get(\"messages\", [])\n",
" temperature = float(body.get(\"temperature\", 0.2))\n",
" req_max = body.get(\"max_tokens\", None)\n",
" stop_words = (body.get(\"stop\", []) or []) + [\"<|im_end|>\", \"</s>\"]\n",
"\n",
" SERVER_MAX = int(os.getenv(\"SERVER_MAX_NEW_TOKENS\", \"512\"))\n",
" max_tokens = int(req_max if isinstance(req_max, int) else MAX_TOKENS)\n",
" max_tokens = max(32, min(max_tokens, CTX_LEN - 128, SERVER_MAX))\n",
"\n",
" rid = \"chatcmpl-\" + uuid.uuid4().hex[:24]\n",
" created = int(time.time())\n",
" model_name = f\"{{self.model_repo}}/{{self.model_file}}\"\n",
"\n",
" try:\n",
" result = self.llm.create_chat_completion(\n",
" messages=messages,\n",
" temperature=temperature,\n",
" max_tokens=max_tokens,\n",
" top_k=50,\n",
" top_p=0.9,\n",
" repeat_penalty=1.1,\n",
" stop=stop_words,\n",
" )\n",
" out_text = (result[\"choices\"][0][\"message\"][\"content\"] or \"\").strip()\n",
" usage_raw = result.get(\"usage\") or {{}}\n",
" p_tokens = int(usage_raw.get(\"prompt_tokens\") or 0)\n",
" c_tokens = int(usage_raw.get(\"completion_tokens\") or 0)\n",
" err = None\n",
" except Exception as e:\n",
" out_text = \"\"\n",
" p_tokens = c_tokens = 0\n",
" err = str(e)\n",
"\n",
" if USE_MLFLOW:\n",
" try:\n",
" dur_ms = int((time.time()-t0) * 1000)\n",
" with mlflow.start_run(run_name=\"chat\"):\n",
" mlflow.set_tags({{\n",
" \"model_repo\": self.model_repo,\n",
" \"model_file\": self.model_file,\n",
" \"framework\": \"llama-cpp-python\",\n",
" }})\n",
" mlflow.log_params({{\n",
" \"temperature\": temperature,\n",
" \"max_tokens\": max_tokens,\n",
" \"ctx\": CTX_LEN,\n",
" }})\n",
" if not (p_tokens and c_tokens):\n",
" p_tokens = p_tokens or max(1, len(\" \".join(m.get(\"content\",\"\") for m in messages).split()))\n",
" c_tokens = c_tokens or max(0, len(out_text.split()))\n",
" mlflow.log_metrics({{\n",
" \"duration_ms\": dur_ms,\n",
" \"prompt_tokens_approx\": p_tokens,\n",
" \"completion_tokens_approx\": c_tokens,\n",
" \"total_tokens_approx\": p_tokens + c_tokens,\n",
" }})\n",
" except Exception:\n",
" pass\n",
"\n",
" if err:\n",
" return JSONResponse(status_code=500, content={{\"error\": err, \"type\":\"generation_error\"}})\n",
"\n",
" usage = {{\n",
" \"prompt_tokens\": p_tokens,\n",
" \"completion_tokens\": c_tokens,\n",
" \"total_tokens\": p_tokens + c_tokens,\n",
" }}\n",
" return {{\n",
" \"id\": rid,\n",
" \"object\": \"chat.completion\",\n",
" \"created\": created,\n",
" \"model\": model_name,\n",
" \"choices\": [\n",
" {{\n",
" \"index\": 0,\n",
" \"message\": {{\"role\":\"assistant\",\"content\": out_text}},\n",
" \"finish_reason\": \"stop\"\n",
" }}\n",
" ],\n",
" \"usage\": usage\n",
" }}\n",
"\n",
"serve.run(OpenAICompatLlama.bind(), route_prefix=SERVE_ROUTE)\n",
"print(\"READY\", flush=True)\n",
"\"\"\").strip()\n",
"\n",
"payload = base64.b64encode(serve_py.encode()).decode()\n",
"entrypoint = 'python -c \"import base64,sys;exec(base64.b64decode(\\'{}\\').decode())\"'.format(payload)\n",
"\n",
"job = requests.post(\n",
" f\"{DASH_URL}/api/jobs/\",\n",
" json={\n",
" \"entrypoint\": entrypoint,\n",
" \"runtime_env\": runtime_env,\n",
" \"metadata\": {\"job_name\": \"serve-qwen2_5-llama_cpp-openai\"},\n",
" },\n",
" timeout=45\n",
").json()\n",
"\n",
"print(\"Job:\", job.get(\"job_id\"))\n",
"print(\"Health:\", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/healthz\")\n",
"print(\"Chat: \", f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/chat/completions\")"
]
},
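{
"cell_type": "markdown",
"id": "a5f6e7d8-5555-4ef0-b123-456789abcdef",
"metadata": {},
"source": [
"### Cell 3b - (Optional) Wait for the model endpoint to come up\n",
"\n",
"On the first run the job above has to install its pip dependencies, download the GGUF file, and load it into llama-cpp, which can take several minutes, so the health URL printed in Cell 3 will error until the deployment is ready. This polling loop mirrors the smoke-test loop from Cell 2; it assumes `HEAD`, `SERVE_PORT`, `SERVE_ROUTE`, `DASH_URL`, and the `job` dict from Cell 3 are still in scope."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6a7f8e9-6666-4f01-8234-56789abcdef0",
"metadata": {},
"outputs": [],
"source": [
"import time, requests\n",
"\n",
"# Optional sketch: poll job status and the Serve health endpoint until the model is loaded.\n",
"# Reuses HEAD, SERVE_PORT, SERVE_ROUTE, DASH_URL and the job dict from Cell 3.\n",
"job_id = job.get(\"job_id\")\n",
"health_url = f\"http://{HEAD}:{SERVE_PORT}{SERVE_ROUTE}/healthz\"\n",
"\n",
"for i in range(120):  # up to ~10 minutes at 5s per tick\n",
"    status = requests.get(f\"{DASH_URL}/api/jobs/{job_id}\", timeout=10).json().get(\"status\")\n",
"    try:\n",
"        r = requests.get(health_url, timeout=3)\n",
"        print(f\"tick {i:03d}: job={status}, health={r.status_code}\")\n",
"        if r.status_code == 200:\n",
"            print(\"Model endpoint is ready\")\n",
"            break\n",
"    except Exception as e:\n",
"        print(f\"tick {i:03d}: job={status}, health=ERR {type(e).__name__}\")\n",
"    if status in (\"FAILED\", \"STOPPED\"):\n",
"        print(\"Job ended without serving; check its logs via the Jobs API\")\n",
"        break\n",
"    time.sleep(5)"
]
},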
{
"cell_type": "markdown",
"id": "a411c015-c802-4ca1-81bb-3f4790d9626a",
"metadata": {},
"source": [
"### Cell 4 - Basic client + latency test\n",
"\n",
"Calls /v1/healthz and then sends an OpenAI-style chat request to /v1/chat/completions with a short prompt. Prints latency and token usage, returning the assistant text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3be634e2-a82f-42c9-8e31-57e6868a86ee",
"metadata": {},
"outputs": [],
"source": [
"import os, time, requests, json\n",
"\n",
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
"SERVE_PORT = 8000\n",
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\"\n",
"\n",
"def health():\n",
" r = requests.get(f\"{BASE_URL}/healthz\", timeout=10)\n",
" print(\"Health:\", r.status_code, r.json())\n",
"\n",
"def chat(prompt, temperature=0.4, max_tokens=220, stop=None):\n",
" body = {\n",
" \"model\": \"qwen2.5-1.5b-instruct-gguf\",\n",
" \"temperature\": float(temperature),\n",
" \"max_tokens\": int(max_tokens),\n",
" \"messages\": [\n",
" {\"role\": \"system\", \"content\": \"You are Qwen2.5 Instruct running on a tiny CPU host. Be concise, complete sentences.\"},\n",
" {\"role\": \"user\", \"content\": prompt},\n",
" ],\n",
" }\n",
" if stop:\n",
" body[\"stop\"] = stop\n",
"\n",
" t0 = time.time()\n",
" r = requests.post(f\"{BASE_URL}/chat/completions\", json=body, timeout=300)\n",
" dt = time.time() - t0\n",
" r.raise_for_status()\n",
" out = r.json()[\"choices\"][0][\"message\"][\"content\"]\n",
" usage = r.json().get(\"usage\", {})\n",
" print(f\"\\nLatency: {dt:.2f}s | usage: {usage}\")\n",
" print(\"\\n---\\n\", out)\n",
" return out\n",
"\n",
"health()\n",
"_ = chat(\"Say 'test ok' then give me one short fun fact about llamas.\", stop=[\"<|im_end|>\"])"
]
},
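{
"cell_type": "markdown",
"id": "c7b8a9f0-7777-4012-8345-6789abcdef01",
"metadata": {},
"source": [
"### Cell 4b - (Optional) Call the endpoint with the OpenAI client\n",
"\n",
"Because the deployment mimics the OpenAI `/v1/chat/completions` schema, the official `openai` Python client (v1.x) can talk to it directly; this is the same mechanism Autogen relies on in the next cell. A sketch only: the `model` string is informational (the server always uses the GGUF it loaded) and the `api_key` just has to be non-empty."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8c9b0a1-8888-4123-9456-789abcdef012",
"metadata": {},
"outputs": [],
"source": [
"!pip -q install \"openai>=1.30\" --disable-pip-version-check\n",
"\n",
"import os\n",
"from openai import OpenAI\n",
"\n",
"# Point the standard OpenAI client at the Ray Serve endpoint from Cell 4.\n",
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
"client = OpenAI(base_url=f\"http://{HEAD}:8000/v1\", api_key=\"local\")\n",
"\n",
"resp = client.chat.completions.create(\n",
"    model=\"qwen2.5-1.5b-instruct-gguf\",  # informational only for this server\n",
"    temperature=0.3,\n",
"    max_tokens=64,\n",
"    messages=[{\"role\": \"user\", \"content\": \"In one sentence, what is Ray Serve?\"}],\n",
")\n",
"print(resp.choices[0].message.content)"
]
},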
{
"cell_type": "markdown",
"id": "553d2756-8949-43e3-8342-71387688e0fa",
"metadata": {},
"source": [
"### Cell 5 - Multi-agent (Autogen) pipeline\n",
"\n",
"Installs Autogen, configures OpenAIWrapper to hit Ray Serve /v1 endpoint, warms up the model, then runs a simple three-agent workflow (Researcher -> Writer -> Critic) to produce and refine a short report."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f6713f3-8b60-40b2-ad3c-ebf6db4f66e1",
"metadata": {},
"outputs": [],
"source": [
"!pip -q install pyautogen~=0.2.35 \"flaml[automl]\" --disable-pip-version-check\n",
"\n",
"import os, sys\n",
"\n",
"for p in [\n",
" \"/tmp/models-cache/lib/python3.11/site-packages\", \n",
" os.path.expanduser(\"~/.local/lib/python3.11/site-packages\"), \n",
"]:\n",
" if os.path.isdir(p) and p not in sys.path:\n",
" sys.path.insert(0, p)\n",
"\n",
"import os, autogen\n",
"from autogen import AssistantAgent, UserProxyAgent\n",
"\n",
"HEAD = os.environ.get(\"RAY_HEAD_SVC\", \"ai-starter-kit-kuberay-head-svc\")\n",
"SERVE_PORT = 8000\n",
"BASE_URL = f\"http://{HEAD}:{SERVE_PORT}/v1\" \n",
"\n",
"config_list = [\n",
" {\n",
" \"model\": \"qwen2.5-1.5b-instruct-gguf\", \n",
" \"base_url\": BASE_URL, \n",
" \"api_key\": \"local\", \n",
" \"price\": [0.0, 0.0],\n",
" }\n",
"]\n",
"\n",
"llm = autogen.OpenAIWrapper(config_list=config_list)\n",
"try:\n",
" r = llm.create(messages=[{\"role\":\"user\",\"content\":\"Say 'test ok'.\"}], temperature=0.2, max_tokens=16)\n",
" print(\"Warmup:\", r.choices[0].message.content)\n",
"except Exception as e:\n",
" print(\"Warmup failed:\", e)\n",
"\n",
"user_proxy = UserProxyAgent(\n",
" name=\"UserProxy\",\n",
" system_message=\"You are the human admin. Initiate the task.\",\n",
" code_execution_config=False,\n",
" human_input_mode=\"NEVER\",\n",
")\n",
"\n",
"researcher = AssistantAgent(\n",
" name=\"Researcher\",\n",
" system_message=(\n",
" \"You are a researcher. Gather concise, verified facts on the topic. \"\n",
" \"Return several bullet points with inline source domains (e.g., nature.com, ibm.com). \"\n",
" \"Keep under 100 words total. No made-up sources. \"\n",
" \"Do not include any special end token.\"\n",
" ),\n",
" llm_config={\"config_list\": config_list, \"temperature\": 0.35, \"max_tokens\": 140, \"timeout\": 300},\n",
")\n",
"\n",
"writer = AssistantAgent(\n",
" name=\"Writer\",\n",
" system_message=(\n",
" \"You are a writer. Using the Researchers notes, produce a clear word report under 160 words. \"\n",
" \"Avoid speculation. Keep it structured and readable. \"\n",
" \"Do not include any special end token.\"\n",
" ),\n",
" llm_config={\"config_list\": config_list, \"temperature\": 0.55, \"max_tokens\": 220, \"timeout\": 180},\n",
")\n",
"\n",
"critic = AssistantAgent(\n",
" name=\"Critic\",\n",
" system_message=(\n",
" \"You are a critic. Review the Writers report for accuracy, clarity, and flow.\"\n",
" \"Present the tightened final text and keep it under 140 words. On a new last line output exactly: <|END|>\"\n",
" ),\n",
" llm_config={\"config_list\": config_list, \"temperature\": 0.45, \"max_tokens\": 160, \"timeout\": 300},\n",
")\n",
"\n",
"def run_sequential(task):\n",
" research_response = researcher.generate_reply(messages=[{\"content\": task, \"role\": \"user\"}])\n",
" research_notes = research_response if isinstance(research_response, str) else research_response.get(\"content\", \"[no output]\")\n",
" print(\"\\nResearch Notes:\\n\", research_notes)\n",
"\n",
" writer_prompt = f\"Using these research notes, write the report:\\n{research_notes}\"\n",
" writer_response = writer.generate_reply(messages=[{\"content\": writer_prompt, \"role\": \"user\"}])\n",
" report = writer_response if isinstance(writer_response, str) else writer_response.get(\"content\", \"[no output]\")\n",
" print(\"\\nDraft Report:\\n\", report)\n",
"\n",
" critic_prompt = f\"Review this report:\\n{report}\"\n",
" critic_response = critic.generate_reply(messages=[{\"content\": critic_prompt, \"role\": \"user\"}])\n",
" final_text = critic_response if isinstance(critic_response, str) else critic_response.get(\"content\", \"[no output]\")\n",
" print(\"\\nFinal Review:\\n\", final_text)\n",
" return final_text\n",
"\n",
"task = \"Research the latest advancements in quantum computing as of 2025. Gather key facts, then write a short report (200300 words). Have the Critic review and finalize.\"\n",
"final_output = run_sequential(task)"
]
},
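{
"cell_type": "markdown",
"id": "e9d0c1b2-9999-4234-a567-89abcdef0123",
"metadata": {},
"source": [
"### Cell 5b - (Optional) Log the multi-agent result to MLflow\n",
"\n",
"The Serve deployment already logs one MLflow run per chat request, but the multi-agent pipeline itself leaves no trace. This sketch records the final Critic output as its own run so it appears next to the per-request runs that Cell 6 lists. The run name, tags, and metric names here are illustrative choices, not something the deployment defines; it assumes `task` and `final_output` from Cell 5."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0e1d2c3-aaaa-4345-b678-9abcdef01234",
"metadata": {},
"outputs": [],
"source": [
"!pip -q install mlflow==2.14.3 --disable-pip-version-check\n",
"\n",
"import os, mlflow\n",
"\n",
"mlflow.set_tracking_uri(os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\"))\n",
"mlflow.set_experiment(os.getenv(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\"))\n",
"\n",
"# Illustrative run: tag the pipeline shape, keep the task as a (length-limited) param,\n",
"# store the final report as a text artifact, and log a rough size metric.\n",
"with mlflow.start_run(run_name=\"autogen-pipeline\"):\n",
"    mlflow.set_tags({\"pipeline\": \"researcher-writer-critic\", \"framework\": \"pyautogen\"})\n",
"    mlflow.log_param(\"task\", task[:250])\n",
"    mlflow.log_metric(\"final_words\", len(final_output.split()))\n",
"    mlflow.log_text(final_output, \"final_report.txt\")\n",
"print(\"Logged multi-agent run to MLflow\")"
]
},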
{
"cell_type": "markdown",
"id": "0af596cf-5ba6-42df-a030-61d7a20d6f7b",
"metadata": {},
"source": [
"### Cell 6 - MLFlow: connect to tracking server and list recent chat runs\n",
"\n",
"Installs MLflow, sets the tracking URI and experiment, then queries and prints the latest runs with key params/metrics (temperature, max_tokens, duration) to verify Serve logging."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03a1b042-04df-4cd0-9099-4cc763ecfe9d",
"metadata": {},
"outputs": [],
"source": [
"!pip -q install mlflow==2.14.3 --disable-pip-version-check\n",
"\n",
"import os, mlflow\n",
"from datetime import datetime\n",
"\n",
"tracking_uri = os.getenv(\"MLFLOW_TRACKING_URI\", \"http://ai-starter-kit-mlflow:5000\")\n",
"mlflow.set_tracking_uri(tracking_uri)\n",
"print(f\"MLflow Tracking URI: {tracking_uri}\")\n",
"\n",
"exp_name = os.getenv(\"MLFLOW_EXPERIMENT_NAME\", \"ray-llama-cpp\")\n",
"exp = mlflow.set_experiment(exp_name)\n",
"print(f\"Experiment: {exp.name} (ID: {exp.experiment_id})\")\n",
"print(\"-\" * 60)\n",
"\n",
"client = mlflow.tracking.MlflowClient()\n",
"runs = client.search_runs(\n",
" exp.experiment_id, \n",
" order_by=[\"attributes.start_time DESC\"], \n",
" max_results=10\n",
")\n",
"\n",
"if not runs:\n",
" print(\"No runs found. Run cells 4 or 5 first to generate inference requests.\")\n",
"else:\n",
" print(f\"\\nFound {len(runs)} recent runs:\")\n",
" print(\"-\" * 60)\n",
" \n",
" for i, run in enumerate(runs, 1):\n",
" start_time = datetime.fromtimestamp(run.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')\n",
" duration = run.data.metrics.get('duration_ms', 'N/A')\n",
" temp = run.data.params.get('temperature', 'N/A')\n",
" max_tokens = run.data.params.get('max_tokens', 'N/A')\n",
" total_tokens = run.data.metrics.get('total_tokens_approx', 'N/A')\n",
" \n",
" print(f\"\\nRun {i}:\")\n",
" print(f\" ID: {run.info.run_id[:12]}...\")\n",
" print(f\" Time: {start_time}\")\n",
" print(f\" Status: {run.info.status}\")\n",
" print(f\" Temperature: {temp}\")\n",
" print(f\" Max Tokens: {max_tokens}\")\n",
" print(f\" Duration: {duration} ms\")\n",
" print(f\" Total Tokens: {total_tokens}\")\n",
" \n",
" print(\"\\n\" + \"=\" * 60)\n",
" print(\"SUMMARY:\")\n",
" successful = sum(1 for r in runs if r.info.status == 'FINISHED')\n",
" durations = [r.data.metrics.get('duration_ms', 0) for r in runs if r.data.metrics.get('duration_ms')]\n",
" avg_duration = sum(durations) / len(durations) if durations else 0\n",
" \n",
" print(f\" Total Runs: {len(runs)}\")\n",
" print(f\" Successful: {successful}\")\n",
" print(f\" Failed: {len(runs) - successful}\")\n",
" print(f\" Avg Duration: {avg_duration:.1f} ms\" if avg_duration else \" Avg Duration: N/A\")\n",
"\n",
"print(\"\\n\" + \"=\" * 60)\n",
"print(\"MLflow verification complete\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}