Compare commits
No commits in common. "main" and "v0.1.3" have entirely different histories.
.github/workflows/pre-commit.yml

@@ -2,21 +2,8 @@ name: Pre-commit
 
 on:
   pull_request:
-    branches:
-      - main
   push:
-    branches:
-      - main
+    branches: [main]
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

@@ -24,43 +11,27 @@ concurrency:
 
 jobs:
   pre-commit:
-    name: pre-commit
     runs-on: ubuntu-latest
 
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
         with:
           egress-policy: audit
 
-      - name: Checkout containers/ramalama-stack
+      - name: Checkout code
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          # https://github.com/actions/checkout/issues/249
-          fetch-depth: 0
 
       - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
-          python-version: "3.11"
+          python-version: '3.11'
           cache: pip
           cache-dependency-path: |
-            **/requirements*.txt
             .pre-commit-config.yaml
 
-      - name: Run pre-commit
-        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        env:
-          SKIP: no-commit-to-branch
-          RUFF_OUTPUT_FORMAT: github
+      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
 
       - name: Verify if there are any diff files after pre-commit
-        run: git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
-
-      - name: Verify if there are any new files after pre-commit
         run: |
-          unstaged_files=$(git ls-files --others --exclude-standard)
-          if [ -n "$unstaged_files" ]; then
-            echo "There are uncommitted new files, run pre-commit locally and commit again"
-            echo "$unstaged_files"
-            exit 1
-          fi
+          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
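Both "verify" steps in this workflow fail with the message "run pre-commit locally and commit again", so the local loop is worth spelling out. A minimal sketch, assuming the uv-based development setup described in CONTRIBUTING.md below (pre-commit comes from the repo's dev dependency group):

```bash
# Run the same hooks locally before pushing.
uv sync                        # installs dev dependencies, including pre-commit
source .venv/bin/activate
pre-commit run --all-files     # ruff, uv-lock/uv-export, shellcheck, etc. from .pre-commit-config.yaml
git status --porcelain         # any output here is what the CI "verify" steps would reject
git add -u && git commit -s    # commit the fixes with a DCO sign-off
```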
.github/workflows/pypi.yml

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: Build and publish PyPI package
+name: Build, test, and upload PyPI package
 
 on:
   push:

@@ -37,18 +37,16 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
         with:
           egress-policy: audit
 
-      - name: Checkout containers/ramalama-stack
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          # https://github.com/actions/checkout/issues/249
+          # for setuptools-scm
           fetch-depth: 0
 
-      - name: Build and inspect python package
-        uses: hynek/build-and-inspect-python-package@c52c3a4710070b50470d903818a7b25115dcd076 # v2.13.0
+      - uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0
 
       - name: Run 'test-build.sh'
         run: $GITHUB_WORKSPACE/tests/test-build.sh

@@ -58,6 +56,7 @@ jobs:
   # - a PR is merged into main branch
   publish-test-pypi:
     name: Publish packages to test.pypi.org
+    # environment: publish-test-pypi
     if: |
       github.repository_owner == 'containers' && (
         github.event.action == 'published' ||

@@ -69,9 +68,10 @@ jobs:
       id-token: write
     runs-on: ubuntu-latest
     needs: build-package
 
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
         with:
           egress-policy: audit
 

@@ -91,6 +91,7 @@ jobs:
   # - a new GitHub release is published
   publish-pypi:
     name: Publish release to pypi.org
+    # environment: publish-pypi
     if: |
       github.repository_owner == 'containers' && github.event.action == 'published'
     permissions:

@@ -98,11 +99,13 @@ jobs:
       id-token: write
       # allow gh release upload
       contents: write
 
     runs-on: ubuntu-latest
     needs: build-package
 
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
         with:
           egress-policy: audit
 
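The build job above delegates to hynek/build-and-inspect-python-package and to tests/test-build.sh (not shown in this compare). A rough local stand-in, assuming only the setuptools/setuptools-scm backend declared in pyproject.toml:

```bash
# Hedged sketch: build and sanity-check the sdist/wheel locally.
python -m pip install build twine
python -m build                # writes dist/*.tar.gz and dist/*.whl
python -m twine check dist/*   # metadata check before any upload
```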
.github/workflows/test-container.yml (79 lines; present only on main)

@@ -1,79 +0,0 @@
name: Test Container

on:
  workflow_dispatch:
    inputs:
      inference_model:
        description: Model to download and inference via RamaLama
        required: false
        default: llama3.2:3b
  schedule:
    - cron: '0 11 * * *' # Runs at 11AM UTC every morning

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  test-lls-integration:
    name: test-container
    runs-on: ubuntu-latest
    env:
      INFERENCE_MODEL: ${{ inputs.inference_model || 'llama3.2:3b' }}
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
        with:
          egress-policy: audit

      - name: Checkout containers/ramalama-stack
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
        with:
          python-version: "3.11"

      - name: Set Up Environment and Install Dependencies
        run: |
          # install podman
          sudo apt-get -y install podman

          # install packaged version of ramalama
          uv venv
          uv pip install ramalama

      - name: Cache Ramalama store
        id: ramalama-store-cache
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
          path: ~/.local/share/ramalama
          key: ramalama-store-${{ env.INFERENCE_MODEL }}

      - name: Download model to serve with Ramalama
        if: ${{ steps.ramalama-store-cache.outputs.cache-hit != 'true' }}
        run: uv run ramalama pull ${{ env.INFERENCE_MODEL }}

      - name: Run 'test-container.sh'
        run: $GITHUB_WORKSPACE/tests/test-container.sh

      - name: Run 'test-ui-linux.sh'
        run: $GITHUB_WORKSPACE/tests/test-ui-linux.sh

      - name: Upload logs
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: always()
        with:
          name: logs-test-container
          retention-days: 5
          path: |
            **/*.log
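The container job drives tests/test-container.sh, whose contents are not part of this compare. A hand-run approximation using only commands that appear in this repo's README (the RAMALAMA_URL port is the README's assumption, not something verified here):

```bash
# Hedged sketch: exercise the published container image manually.
ramalama pull llama3.2:3b      # same default model the workflow caches
ramalama serve llama3.2:3b &   # RamaLama inference server
podman run \
  --net=host \
  --env RAMALAMA_URL=http://0.0.0.0:8080 \
  --env INFERENCE_MODEL=llama3.2:3b \
  quay.io/ramalama/llama-stack # Llama Stack server, port 8321 by default
```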
.github/workflows/test-external-providers.yml

@@ -2,78 +2,42 @@ name: Test External Providers
 
 on:
   workflow_dispatch:
+    inputs:
+      inference_model:
+        description: Model to download and inference via RamaLama
+        required: false
+        default: llama3.2:3b-instruct-fp16
   push:
-    branches:
-      - main
-    paths:
-      - 'src/ramalama_stack/**'
-      - 'tests/**'
-      - '.github/workflows/test-external-providers.yml'
-      - pyproject.toml
-      - requirements.txt
-      - uv.lock
+    branches: [ main ]
   pull_request:
-    branches:
-      - main
-    paths:
-      - 'src/ramalama_stack/**'
-      - 'tests/**'
-      - '.github/workflows/test-external-providers.yml'
-      - pyproject.toml
-      - requirements.txt
-      - uv.lock
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
+    branches: [ main ]
 
 jobs:
   test-external-providers:
-    name: test-external-providers
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        inference_model:
-          - 'llama3.2:3b'
-          - 'granite3.2:2b'
     env:
-      INFERENCE_MODEL: ${{ matrix.inference_model }}
+      INFERENCE_MODEL: ${{ inputs.inference_model || 'llama3.2:3b-instruct-fp16' }}
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
         with:
           egress-policy: audit
 
-      - name: Set INFERENCE_MODEL_NO_COLON for logging artifacts
-        run: echo "INFERENCE_MODEL_NO_COLON=$(echo "$INFERENCE_MODEL" | tr ':' '_')" >> $GITHUB_ENV
-
-      - name: Checkout containers/ramalama-stack
+      - name: Checkout repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          # https://github.com/actions/checkout/issues/249
-          fetch-depth: 0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
+        uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
         with:
-          python-version: "3.11"
+          python-version: "3.10"
 
       - name: Set Up Environment and Install Dependencies
         run: |
           uv sync
+          uv pip install -e .
-          # temporary hack for file writing that should be done by the pip setup script
-          # https://github.com/containers/ramalama-stack/issues/53
           mkdir -p ~/.llama/distributions/ramalama/
-          cp -r $GITHUB_WORKSPACE/src/ramalama_stack/providers.d/ ~/.llama/
-          cp $GITHUB_WORKSPACE/src/ramalama_stack/ramalama-run.yaml ~/.llama/distributions/ramalama/ramalama-run.yaml
+          cp -r src/ramalama_stack/providers.d/ ~/.llama/
+          cp src/ramalama_stack/ramalama-run.yaml ~/.llama/distributions/ramalama/ramalama-run.yaml
 
       - name: Run 'test-build.sh'
         run: $GITHUB_WORKSPACE/tests/test-build.sh

@@ -92,14 +56,11 @@ jobs:
       - name: Run 'test-external-providers.sh'
         run: $GITHUB_WORKSPACE/tests/test-external-providers.sh
 
-      - name: Run 'test-rag.sh'
-        run: $GITHUB_WORKSPACE/tests/test-rag.sh
-
       - name: Upload logs
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         if: always()
         with:
-          name: logs-test-external-providers-${{ env.INFERENCE_MODEL_NO_COLON }}
+          name: logs
           retention-days: 5
           path: |
             **/*.log
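On the main side this job is plain shell, so it can be reproduced outside CI by running the same commands from a checkout of containers/ramalama-stack (a sketch; the model name is the first matrix entry):

```bash
export INFERENCE_MODEL='llama3.2:3b'
uv sync
# temporary hack tracked in https://github.com/containers/ramalama-stack/issues/53
mkdir -p ~/.llama/distributions/ramalama/
cp -r src/ramalama_stack/providers.d/ ~/.llama/
cp src/ramalama_stack/ramalama-run.yaml ~/.llama/distributions/ramalama/ramalama-run.yaml
./tests/test-build.sh
./tests/test-external-providers.sh
```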
.github/workflows/test-lls-integration.yml (86 lines; present only on main)

@@ -1,86 +0,0 @@
name: Test LLS Integration

on:
  workflow_dispatch:
    inputs:
      inference_model:
        description: Model to download and inference via RamaLama
        required: false
        default: llama3.2:3b
  schedule:
    - cron: '0 11 * * *' # Runs at 11AM UTC every morning

env:
  LC_ALL: en_US.UTF-8

defaults:
  run:
    shell: bash

permissions:
  contents: read

jobs:
  test-lls-integration:
    name: test-lls-integration
    runs-on: ubuntu-latest
    env:
      INFERENCE_MODEL: ${{ inputs.inference_model || 'llama3.2:3b' }}
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
        with:
          egress-policy: audit

      - name: Checkout containers/ramalama-stack
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
        with:
          python-version: "3.11"

      - name: Set Up Environment and Install Dependencies
        run: |
          uv venv

          # install packaged version of ramalama-stack
          uv pip install ramalama-stack

          # update llama-stack version to main branch
          uv pip install git+https://github.com/meta-llama/llama-stack.git@main

          # temporary hack for file writing that should be done by the pip setup script
          # https://github.com/containers/ramalama-stack/issues/53
          mkdir -p ~/.llama/distributions/ramalama/
          cp -r $GITHUB_WORKSPACE/src/ramalama_stack/providers.d/ ~/.llama/
          cp $GITHUB_WORKSPACE/src/ramalama_stack/ramalama-run.yaml ~/.llama/distributions/ramalama/ramalama-run.yaml

      - name: Run 'test-build.sh'
        run: $GITHUB_WORKSPACE/tests/test-build.sh

      - name: Cache Ramalama store
        id: ramalama-store-cache
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
          path: ~/.local/share/ramalama
          key: ramalama-store-${{ env.INFERENCE_MODEL }}

      - name: Download model to serve with Ramalama
        if: ${{ steps.ramalama-store-cache.outputs.cache-hit != 'true' }}
        run: uv run ramalama pull ${{ env.INFERENCE_MODEL }}

      - name: Run 'test-external-providers.sh'
        run: $GITHUB_WORKSPACE/tests/test-external-providers.sh

      - name: Upload logs
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: always()
        with:
          name: logs-test-lls-integration
          retention-days: 5
          path: |
            **/*.log
.gitignore

@@ -176,4 +176,3 @@ cython_debug/
 # Anything additional
 distributions/
 src/ramalama_stack/_version.py
-.python-version
.pre-commit-config.yaml

@@ -16,14 +16,14 @@ repos:
       - id: check-shebang-scripts-are-executable
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.12
+    rev: v0.11.7
     hooks:
       - id: ruff
         args: [ --fix ]
       - id: ruff-format
 
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.7.9
+    rev: 0.6.17
     hooks:
       - id: uv-lock
       - id: uv-export

@@ -31,8 +31,6 @@ repos:
           "--frozen",
           "--no-hashes",
           "--no-emit-project",
-          "--no-default-groups",
-          "--output-file=requirements.txt"
         ]
 
   - repo: https://github.com/koalaman/shellcheck-precommit
.python-version (present only in v0.1.3)

@@ -0,0 +1 @@
+3.10

CONTRIBUTING.md (316 lines; present only on main)
@@ -1,316 +0,0 @@
# Contributing to ramalama-stack

We'd love to have you join the community!
Below summarizes the processes that we follow.

## Topics

* [Reporting Issues](#reporting-issues)
* [Working On Issues](#working-on-issues)
* [Contributing To ramalama-stack](#contributing-to-ramalama-stack-1)
* [Submitting Pull Requests](#submitting-pull-requests)
* [Communications](#communications)
* [Code of Conduct](#code-of-conduct)

## Reporting Issues

Before reporting an issue, check our backlog of [open issues](https://github.com/containers/ramalama-stack/issues) to see if someone else has already reported it.
If so, feel free to add your scenario, or additional information, to the discussion.
Or simply "subscribe" to it to be notified when it is updated.
Please do not add comments like "+1" or "I have this issue as well" without adding any new information.
Instead, please add a thumbs-up emoji to the original report.

Note: Older closed issues/PRs are automatically locked.
If you have a similar problem please open a new issue instead of commenting.

If you find a new issue with the project we'd love to hear about it!
The most important aspect of a bug report is that it includes enough information for us to reproduce it.
To make this easier, there are three types of issue templates you can use.

* If you have a bug to report, please use the *Bug Report* template.
* If you have an idea to propose, please use the *Feature Request* template.
* If your issue is something else, please use the default *Blank issue* template.

Please include as much detail as possible, including all requested fields in the template.
Not having all requested information makes it much harder to find and fix issues.
A reproducer is the best thing you can include.
Reproducers make finding and fixing issues much easier for maintainers.
The easier it is for us to reproduce a bug, the faster it'll be fixed!

Please don't include any private/sensitive information in your issue!
Security issues should NOT be reported via Github and should instead be reported via the process described [here](https://github.com/containers/common/blob/main/SECURITY.md).

## Working On Issues

Once you have decided to contribute to ramalama-stack by working on an issue, check our backlog of [open issues](https://github.com/containers/ramalama-stack/issues) looking for any that are unassigned.
If you want to work on a specific issue that is already assigned but does not appear to be actively being worked on, please ping the assignee in the issue and ask if you can take over.
If they do not respond after several days, you can notify a maintainer to have the issue reassigned.
When working on an issue, please assign it to yourself.
If you lack permissions to do so, you can ping the `@containers/ramalama-stack-maintainers` group to have a maintainer set you as assignee.

## Contributing To ramalama-stack

This section describes how to make a contribution to ramalama-stack.

### Prepare your environment

The minimum version of Python required to use ramalama-stack is Python 3.11.

### Fork and clone ramalama-stack

First, you need to fork this project on GitHub.
Then clone your fork locally:

```shell
$ git clone git@github.com:<you>/ramalama-stack
$ cd ./ramalama-stack/
```

### Install required tools

We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).

You can install the dependencies by running:

```bash
cd ramalama-stack
uv sync
source .venv/bin/activate
```

> [!NOTE]
> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.11`).
> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).

### Adding dependencies

Please add dependencies using the [uv-documented approach](https://docs.astral.sh/uv/concepts/projects/dependencies/#adding-dependencies).

This should update both the `pyproject.toml` and the `uv.lock` file.

The `requirements.txt` file should be updated as well by `pre-commit` - you can also do this manually via `uv export --frozen --no-hashes --no-emit-project --no-default-groups --output-file=requirements.txt`.

## Testing

ramalama-stack provides a small suite of tests in the `test/` directory.
Most pull requests should be accompanied by test changes covering the changes in the PR.
Pull requests without tests will receive additional scrutiny from maintainers and may be blocked from merging unless tests are added.
Maintainers will decide if tests are not necessary during review.

### Types of Tests

There are several types of tests run by ramalama-stack's upstream CI.

* Pre-commit checks
* Functional testing
* Integration testing
* PyPI build and upload testing

## Documentation

Make sure to update the documentation if needed.
ramalama-stack is documented via its [README](https://github.com/containers/ramalama-stack/blob/main/docs/README.md) and files in the `docs/` directory.

## Submitting Pull Requests

No Pull Request (PR) is too small!
Typos, additional comments in the code, new test cases, bug fixes, new features, more documentation, ... it's all welcome!

While bug fixes can first be identified via an "issue" in Github, that is not required.
It's ok to just open up a PR with the fix, but make sure you include the same information you would have included in an issue - like how to reproduce it.

PRs for new features should include some background on what use cases the new code is trying to address.
When possible and when it makes sense, try to break up larger PRs into smaller ones - it's easier to review smaller code changes.
But only if those smaller ones make sense as stand-alone PRs.

Regardless of the type of PR, all PRs should include:

* Well-documented code changes, both through comments in the code itself and high-quality commit messages.
* Additional tests. Ideally, they should fail w/o your code change applied.
* Documentation updates to reflect the changes made in the pull request.

Squash your commits into logical pieces of work that might want to be reviewed separately from the rest of the PRs.
Squashing down to just one commit is also acceptable since in the end the entire PR will be reviewed anyway.
When in doubt, squash.

When your PR fixes an issue, please note that by including `Fixes: #00000` in the commit description.
More details on this are below, in the "Describe your changes in Commit Messages" section.

The ramalama-stack repo follows a one-ack policy for merges.
PRs will be approved and merged by a repo owner.
Two reviews are required for a pull request to merge, including sourcery.ai.

### Describe your Changes in Commit Messages

Describe your problem.
Whether your patch is a one-line bug fix or 5000 lines of a new feature, there must be an underlying problem that motivated you to do this work.
Convince the reviewer that there is a problem worth fixing and that it makes sense for them to read past the first paragraph.

Describe user-visible impact.
Straight up crashes and lockups are pretty convincing, but not all bugs are that blatant.
Even if the problem was spotted during code review, describe the impact you think it can have on users.
Keep in mind that the majority of users run packages provided by distributions, so include anything that could help route your change downstream.

Quantify optimizations and trade-offs.
If you claim improvements in performance, memory consumption, stack footprint, or binary size, include numbers that back them up.
But also describe non-obvious costs.
Optimizations usually aren't free but trade-offs between CPU, memory, and readability; or, when it comes to heuristics, between different workloads.
Describe the expected downsides of your optimization so that the reviewer can weigh costs against benefits.

Once the problem is established, describe what you are actually doing about it in technical detail.
It's important to describe the change in plain English for the reviewer to verify that the code is behaving as you intend it to.

Solve only one problem per patch.
If your description starts to get long, that's a sign that you probably need to split up your patch.

If the patch fixes a logged bug entry, refer to that bug entry by number and URL.
If the patch follows from a mailing list discussion, give a URL to the mailing list archive.
Please format these lines as `Fixes:` followed by the URL or, for Github bugs, the bug number preceded by a #.
For example:

```
Fixes: #00000
Fixes: https://github.com/containers/ramalama-stack/issues/00000
Fixes: https://issues.redhat.com/browse/RHEL-00000
Fixes: RHEL-00000
```

However, try to make your explanation understandable without external resources.
In addition to giving a URL to a mailing list archive or bug, summarize the relevant points of the discussion that led to the patch as submitted.

If you want to refer to a specific commit, don't just refer to the SHA-1 ID of the commit.
Please also include the one-line summary of the commit, to make it easier for reviewers to know what it is about. If the commit was merged in GitHub, referring to a GitHub PR number is also a good option, as that will retain all discussion from development, and makes including a summary less critical.
Examples:

```
Commit f641c2d9384e ("fix bug in rm -fa parallel deletes") [...]
PR #00000
```

When referring to a commit by SHA, you should also be sure to use at least the first twelve characters of the SHA-1 ID.
The ramalama-stack repository holds a lot of objects, making collisions with shorter IDs a real possibility.
Bear in mind that, even if there is no collision with your six-character ID now, that condition may change five years from now.

The following git config settings can be used to add a pretty format for outputting the above style in the git log or git show commands:

```
[core]
        abbrev = 12
[pretty]
        fixes = Fixes: %h (\"%s\")
```

### Sign your PRs

The sign-off is a line at the end of the explanation for the patch.
Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch.
The rules are simple: if you can certify the below (from [developercertificate.org](https://developercertificate.org/)):

```
Developer Certificate of Origin
Version 1.1

Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
660 York Street, Suite 102,
San Francisco, CA 94110 USA

Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.

Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the open source license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the best
    of my knowledge, is covered under an appropriate open source
    license and I have the right under that license to submit that
    work with modifications, whether created in whole or in part
    by me, under the same open source license (unless I am
    permitted to submit under a different license), as indicated
    in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including all
    personal information I submit with it, including my sign-off) is
    maintained indefinitely and may be redistributed consistent with
    this project or the open source license(s) involved.
```

Then you just add a line to every git commit message:

    Signed-off-by: Joe Smith <joe.smith@email.com>

Use your real name (sorry, no pseudonyms or anonymous contributions).

If you set your `user.name` and `user.email` git configs, you can sign your commit automatically with `git commit -s`.

### Continuous Integration

All pull requests automatically run ramalama-stack's test suite.

There is always additional complexity added by automation, and so it sometimes can fail for any number of reasons.
This includes post-merge testing on all branches, where you may occasionally see [red bars on the status graph](https://github.com/containers/ramalama-stack/blob/main/docs/ci.md).

Most notably, the tests will occasionally flake.
If you see a single test on your PR has failed, and you do not believe it is caused by your changes, you can rerun the tests.
If you lack permissions to rerun the tests, please ping the maintainers using the `@containers/ramalama-stack-maintainers` group and request that the failing test be rerun.

If you see multiple test failures, you may wish to check the status graph mentioned above.
When the graph shows mostly green bars on the right, it's a good indication the main branch is currently stable.
Alternating red/green bars is indicative of a testing "flake", and should be examined (anybody can do this):

* *One or a small handful of tests, on a single task (i.e. specific distro/version), where all others ran successfully:* Frequently the cause is networking or a brief external service outage. The failed tasks may simply be re-run by pressing the corresponding button on the task details page.

* *Multiple tasks failing*: Logically this should be due to some shared/common element. If that element is identifiable as a networking or external service (e.g. packaging repository outage), a re-run should be attempted.

* *All tasks are failing*: If a common element is **not** identifiable as temporary (i.e. container registry outage), please seek assistance via [the methods below](#communications) as this may be early indication of a more serious problem.

In the (hopefully) rare case there are multiple, contiguous red bars, this is a ***very bad*** sign. It means additional merges are occurring despite an uncorrected or persistently faulty condition. This risks additional bugs being introduced and further complication of necessary corrective measures. Most likely people are aware and working on this, but it doesn't hurt [to confirm and/or try and help if possible](#communications).

## Communications

If you need help, you can contact the maintainers using the channels mentioned in RamaLama's [communications](https://github.com/containers/ramalama/blob/main/README.md#community) document.

For discussions around issues/bugs and features, you can use the GitHub [issues](https://github.com/containers/ramalama-stack/issues) and [PRs](https://github.com/containers/ramalama-stack/pulls) tracking system.

## Code of Conduct

As contributors and maintainers of the projects under the [Containers](https://github.com/containers) repository, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities to any of the projects under the containers umbrella. The full code of conduct guidelines can be found [here](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md).

### Bot Interactions

ramalama-stack uses [sourcery.ai](https://sourcery.ai/) for AI code reviews.

You can read their docs [here](https://docs.sourcery.ai/Code-Review/#interacting-with-sourcery) on how to interact with the bot.
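The "Adding dependencies" section of the removed CONTRIBUTING.md above gives the uv export command in prose; a minimal sketch of the whole flow it describes, where "some-package" is a placeholder rather than a real dependency of this repo:

```bash
uv add some-package            # updates pyproject.toml and uv.lock ("some-package" is hypothetical)
uv export --frozen --no-hashes --no-emit-project --no-default-groups --output-file=requirements.txt
git commit -s -am "Add some-package dependency"   # pre-commit's uv-export hook would also regenerate requirements.txt
```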
README.md (79 lines changed)

@@ -1,80 +1,7 @@
 # ramalama-stack
 
 [](https://pypi.org/project/ramalama-stack/)
-[](https://pypi.org/project/ramalama-stack/)
 [](https://github.com/containers/ramalama-stack/blob/main/LICENSE)
+
+
+
-
-An external provider for [Llama Stack](https://github.com/meta-llama/llama-stack) allowing for the use of [RamaLama](https://ramalama.ai/) for inference.
-
-## Installing
-
-You can install `ramalama-stack` from PyPI via `pip install ramalama-stack`
-
-This will install Llama Stack and RamaLama as well if they are not installed already.
-
-## Usage
-
-> [!WARNING]
-> The following workaround is currently needed to run this provider - see https://github.com/containers/ramalama-stack/issues/53 for more details
-> ```bash
-> curl --create-dirs --output ~/.llama/providers.d/remote/inference/ramalama.yaml https://raw.githubusercontent.com/containers/ramalama-stack/refs/tags/v0.2.1/src/ramalama_stack/providers.d/remote/inference/ramalama.yaml
-> curl --create-dirs --output ~/.llama/distributions/ramalama/ramalama-run.yaml https://raw.githubusercontent.com/containers/ramalama-stack/refs/tags/v0.2.1/src/ramalama_stack/ramalama-run.yaml
-> ```
-
-1. First you will need a RamaLama server running - see [the RamaLama project](https://github.com/containers/ramalama) docs for more information.
-
-2. Ensure you set your `INFERENCE_MODEL` environment variable to the name of the model you have running via RamaLama.
-
-3. You can then run the RamaLama external provider via `llama stack run ~/.llama/distributions/ramalama/ramalama-run.yaml`
-
-> [!NOTE]
-> You can also run the RamaLama external provider inside of a container via [Podman](https://podman.io/)
-> ```bash
-> podman run \
->   --net=host \
->   --env RAMALAMA_URL=http://0.0.0.0:8080 \
->   --env INFERENCE_MODEL=$INFERENCE_MODEL \
->   quay.io/ramalama/llama-stack
-> ```
-
-This will start a Llama Stack server which will use port 8321 by default. You can test this works by configuring the Llama Stack Client to run against this server and sending a test request.
-- If your client is running on the same machine as the server, you can run `llama-stack-client configure --endpoint http://0.0.0.0:8321 --api-key none`
-- If your client is running on a different machine, you can run `llama-stack-client configure --endpoint http://<hostname>:8321 --api-key none`
-- The client should give you a message similar to `Done! You can now use the Llama Stack Client CLI with endpoint <endpoint>`
-- You can then test the server by running `llama-stack-client inference chat-completion --message "tell me a joke"` which should return something like
-
-```bash
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content='A man walked into a library and asked the librarian, "Do you have any books on Pavlov\'s dogs
-and Schrödinger\'s cat?" The librarian replied, "It rings a bell, but I\'m not sure if it\'s here or not."',
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None,
-    metrics=[
-        Metric(metric='prompt_tokens', value=14.0, unit=None),
-        Metric(metric='completion_tokens', value=63.0, unit=None),
-        Metric(metric='total_tokens', value=77.0, unit=None)
-    ]
-)
-```
-
-## Llama Stack User Interface
-
-Llama Stack includes an experimental user-interface, check it out [here](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distribution/ui).
-
-To deploy the UI, run this:
-
-```bash
-podman run -d --rm --network=container:ramalama --name=streamlit quay.io/redhat-et/streamlit_client:0.1.0
-```
-
-> [!NOTE]
-> If running on MacOS (not Linux), `--network=host` doesn't work. You'll need to publish additional ports `8321:8321` and `8501:8501` with the ramalama serve command, then run with `network=container:ramalama`.
->
-> If running on Linux use `--network=host` or `-p 8501:8501` instead. The streamlit container will be able to access the ramalama endpoint with either.
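The main-branch README above spreads its usage steps across a warning block, a numbered list, and client examples; stitched together, a hedged end-to-end sketch (the model name is an example, and the curl workaround from the README's warning is assumed to have been run already):

```bash
pip install ramalama-stack                # also pulls in llama-stack and ramalama if missing
ramalama serve llama3.2:3b &              # step 1: a running RamaLama server
export INFERENCE_MODEL=llama3.2:3b        # step 2: the model being served
llama stack run ~/.llama/distributions/ramalama/ramalama-run.yaml   # step 3: start the provider
llama-stack-client configure --endpoint http://0.0.0.0:8321 --api-key none
llama-stack-client inference chat-completion --message "tell me a joke"
```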
docs/ci.md (9 lines; present only on main)

@@ -1,9 +0,0 @@
# ramalama-stack CI

| Job | Description | Status |
| --- | ----------- | ------ |
| [Pre-commit](https://github.com/containers/ramalama-stack/blob/main/.github/workflows/pre-commit.yml) | Runs pre-commit checks |  |
| [Test External Providers](https://github.com/containers/ramalama-stack/blob/main/.github/workflows/test-external-providers.yml) | Tests the current `ramalama-stack` branch against the latest released versions of `ramalama` and `llama-stack` |  |
| [Test LLS Integration](https://github.com/containers/ramalama-stack/blob/main/.github/workflows/test-lls-integration.yml) | Tests the latest released versions of `ramalama` and `ramalama-stack` against the current `llama-stack` main branch |  |
| [Test Container](https://github.com/containers/ramalama-stack/blob/main/.github/workflows/test-container.yml) | Tests the latest tagged container image of `ramalama/llama-stack` run via Podman |  |
| [Build and publish PyPI package](https://github.com/containers/ramalama-stack/blob/main/.github/workflows/pypi.yml) | Builds, tests, and publishes `ramalama-stack` package |  |
pyproject.toml

@@ -4,48 +4,13 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ramalama-stack"
-description = "An external provider for Llama Stack allowing for the use of RamaLama for inference."
-authors = [{ name = "The RamaLama Stack Authors" }]
+description = "Llama Stack Provider for Ramalama Inference"
 readme = "README.md"
 license = "Apache-2.0"
 license-files = ["LICENSE"]
 keywords = ["ramalama", "llama", "AI"]
-requires-python = ">=3.11"
-dynamic = ["version"]
-dependencies = [
-    "aiohttp>=3.12.2",
-    "aiosqlite>=0.21.0",
-    "autoevals>=0.0.129",
-    "blobfile>=3.0.0",
-    "chardet>=3.0.0",
-    "datasets>=3.6.0",
-    "fastapi>=0.115.12",
-    "httpx>=0.28.1",
-    "llama-stack==0.2.9",
-    "mcp>=1.9.2",
-    "numpy>=2.2.6",
-    "openai>=1.82.0",
-    "opentelemetry-exporter-otlp-proto-http>=1.33.1",
-    "opentelemetry-sdk>=1.33.1",
-    "peft>=0.15.2",
-    "psutil>=7.0.0",
-    "pydantic>=2.11.5",
-    "pymilvus>=2.5.10",
-    "ramalama==0.9.0",
-    "requests>=2.32.3",
-    "sentence-transformers>=3.0.0",
-    "six>=1.17.0",
-    "sqlalchemy>=2.0.41",
-    "torch>=2.7.0",
-    "trl>=0.18.1",
-    "urllib3>=2.4.0",
-    "uvicorn>=0.34.2",
-]
-
-[dependency-groups]
-dev = [
-    "pre-commit>=3.0.4,<4.0",
-]
+requires-python = ">=3.10"
+dynamic = ["dependencies", "optional-dependencies", "version"]
 
 [project.urls]
 homepage = "https://ramalama.ai"

@@ -64,5 +29,8 @@ include-package-data = true
 [tool.setuptools.package-data]
 "ramalama_stack" = ["providers.d/**/*", "ramalama-run.yaml"]
 
+[tool.setuptools.dynamic]
+dependencies = { file = ["requirements.txt"] }
+
 [tool.ruff]
 extend-exclude = ["*.ipynb"]
A new pip requirements file added on the v0.1.3 side:

@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
+-r requirements.txt
+
+pre-commit>=3.0.4,<4.0
requirements.txt (502 lines changed)

@@ -1,484 +1,18 @@
+ramalama>=0.8.1
+llama-stack>=0.2.3
+urllib3
+faiss-cpu
+autoevals
+six
+pydantic
+aiohttp
+aiosqlite
+datasets
+fastapi
+httpx
+numpy
+openai
+opentelemetry-exporter-otlp-proto-http
+opentelemetry-sdk
+requests
+uvicorn
-# This file was autogenerated by uv via the following command:
-#    uv export --frozen --no-hashes --no-emit-project --no-default-groups --output-file=requirements.txt
-accelerate==1.7.0
-    # via
-    #   peft
-    #   trl
-aiohappyeyeballs==2.6.1
-    # via aiohttp
-aiohttp==3.12.7
-    # via
-    #   fsspec
-    #   llama-stack
-    #   ramalama-stack
-aiosignal==1.3.2
-    # via aiohttp
-aiosqlite==0.21.0
-    # via ramalama-stack
-annotated-types==0.7.0
-    # via pydantic
-anyio==4.9.0
-    # via
-    #   httpx
-    #   llama-stack-client
-    #   mcp
-    #   openai
-    #   sse-starlette
-    #   starlette
-argcomplete==3.6.2
-    # via ramalama
-attrs==25.3.0
-    # via
-    #   aiohttp
-    #   jsonschema
-    #   referencing
-autoevals==0.0.129
-    # via ramalama-stack
-blobfile==3.0.0
-    # via ramalama-stack
-braintrust-core==0.0.59
-    # via autoevals
-certifi==2025.4.26
-    # via
-    #   httpcore
-    #   httpx
-    #   requests
-chardet==5.2.0
-    # via ramalama-stack
-charset-normalizer==3.4.2
-    # via requests
-chevron==0.14.0
-    # via autoevals
-click==8.2.1
-    # via
-    #   llama-stack-client
-    #   uvicorn
-colorama==0.4.6 ; sys_platform == 'win32'
-    # via
-    #   click
-    #   tqdm
-datasets==3.6.0
-    # via
-    #   ramalama-stack
-    #   trl
-deprecated==1.2.18
-    # via
-    #   opentelemetry-api
-    #   opentelemetry-exporter-otlp-proto-http
-    #   opentelemetry-semantic-conventions
-dill==0.3.8
-    # via
-    #   datasets
-    #   multiprocess
-distro==1.9.0
-    # via
-    #   llama-stack-client
-    #   openai
-ecdsa==0.19.1
-    # via python-jose
-fastapi==0.115.12
-    # via ramalama-stack
-filelock==3.18.0
-    # via
-    #   blobfile
-    #   datasets
-    #   huggingface-hub
-    #   torch
-    #   transformers
-fire==0.7.0
-    # via llama-stack
-frozenlist==1.6.0
-    # via
-    #   aiohttp
-    #   aiosignal
-fsspec==2025.3.0
-    # via
-    #   datasets
-    #   huggingface-hub
-    #   torch
-googleapis-common-protos==1.70.0
-    # via opentelemetry-exporter-otlp-proto-http
-greenlet==3.2.2 ; (python_full_version < '3.14' and platform_machine == 'AMD64') or (python_full_version < '3.14' and platform_machine == 'WIN32') or (python_full_version < '3.14' and platform_machine == 'aarch64') or (python_full_version < '3.14' and platform_machine == 'amd64') or (python_full_version < '3.14' and platform_machine == 'ppc64le') or (python_full_version < '3.14' and platform_machine == 'win32') or (python_full_version < '3.14' and platform_machine == 'x86_64')
-    # via sqlalchemy
-grpcio==1.67.1
-    # via pymilvus
-h11==0.16.0
-    # via
-    #   httpcore
-    #   llama-stack
-    #   uvicorn
-hf-xet==1.1.2 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
-    # via huggingface-hub
-httpcore==1.0.9
-    # via httpx
-httpx==0.28.1
-    # via
-    #   llama-stack
-    #   llama-stack-client
-    #   mcp
-    #   openai
-    #   ramalama-stack
-httpx-sse==0.4.0
-    # via mcp
-huggingface-hub==0.32.4
-    # via
-    #   accelerate
-    #   datasets
-    #   llama-stack
-    #   peft
-    #   sentence-transformers
-    #   tokenizers
-    #   transformers
-idna==3.10
-    # via
-    #   anyio
-    #   httpx
-    #   requests
-    #   yarl
-importlib-metadata==8.6.1
-    # via opentelemetry-api
-jinja2==3.1.6
-    # via
-    #   llama-stack
-    #   torch
-jiter==0.10.0
-    # via openai
-joblib==1.5.1
-    # via scikit-learn
-jsonschema==4.24.0
-    # via
-    #   autoevals
-    #   llama-stack
-jsonschema-specifications==2025.4.1
-    # via jsonschema
-llama-stack==0.2.9
-    # via ramalama-stack
-llama-stack-client==0.2.9
-    # via llama-stack
-lxml==5.4.0
-    # via blobfile
-markdown-it-py==3.0.0
-    # via rich
-markupsafe==3.0.2
-    # via jinja2
-mcp==1.9.2
-    # via ramalama-stack
-mdurl==0.1.2
-    # via markdown-it-py
-milvus-lite==2.4.12 ; sys_platform != 'win32'
-    # via pymilvus
-mpmath==1.3.0
-    # via sympy
-multidict==6.4.4
-    # via
-    #   aiohttp
-    #   yarl
-multiprocess==0.70.16
-    # via datasets
-networkx==3.5
-    # via torch
-numpy==2.2.6
-    # via
-    #   accelerate
-    #   datasets
-    #   pandas
-    #   peft
-    #   ramalama-stack
-    #   scikit-learn
-    #   scipy
-    #   transformers
-nvidia-cublas-cu12==12.6.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cudnn-cu12
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cuda-runtime-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-nccl-cu12==2.26.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via
-    #   nvidia-cufft-cu12
-    #   nvidia-cusolver-cu12
-    #   nvidia-cusparse-cu12
-    #   torch
-nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-openai==1.84.0
-    # via
-    #   llama-stack
-    #   ramalama-stack
-opentelemetry-api==1.33.1
-    # via
-    #   opentelemetry-exporter-otlp-proto-http
-    #   opentelemetry-sdk
-    #   opentelemetry-semantic-conventions
-opentelemetry-exporter-otlp-proto-common==1.33.1
-    # via opentelemetry-exporter-otlp-proto-http
-opentelemetry-exporter-otlp-proto-http==1.33.1
-    # via ramalama-stack
-opentelemetry-proto==1.33.1
-    # via
-    #   opentelemetry-exporter-otlp-proto-common
-    #   opentelemetry-exporter-otlp-proto-http
-opentelemetry-sdk==1.33.1
-    # via
-    #   opentelemetry-exporter-otlp-proto-http
-    #   ramalama-stack
-opentelemetry-semantic-conventions==0.54b1
-    # via opentelemetry-sdk
-packaging==25.0
-    # via
-    #   accelerate
-    #   datasets
-    #   huggingface-hub
-    #   peft
-    #   transformers
-pandas==2.2.3
-    # via
-    #   datasets
-    #   llama-stack-client
-    #   pymilvus
-peft==0.15.2
-    # via ramalama-stack
-pillow==11.2.1
-    # via
-    #   llama-stack
-    #   sentence-transformers
-polyleven==0.9.0
-    # via autoevals
-prompt-toolkit==3.0.51
-    # via
-    #   llama-stack
-    #   llama-stack-client
-propcache==0.3.1
-    # via
-    #   aiohttp
-    #   yarl
-protobuf==5.29.5
-    # via
-    #   googleapis-common-protos
-    #   opentelemetry-proto
-    #   pymilvus
-psutil==7.0.0
-    # via
-    #   accelerate
-    #   peft
-    #   ramalama-stack
-pyaml==25.5.0
-    # via llama-stack-client
-pyarrow==20.0.0
-    # via datasets
-pyasn1==0.6.1
-    # via
-    #   python-jose
-    #   rsa
-pycryptodomex==3.23.0
-    # via blobfile
-pydantic==2.11.5
-    # via
-    #   fastapi
-    #   llama-stack
-    #   llama-stack-client
-    #   mcp
-    #   openai
-    #   pydantic-settings
-    #   ramalama-stack
-pydantic-core==2.33.2
-    # via pydantic
-pydantic-settings==2.9.1
-    # via mcp
-pygments==2.19.1
-    # via rich
-pymilvus==2.5.10
-    # via ramalama-stack
-python-dateutil==2.9.0.post0
-    # via pandas
-python-dotenv==1.1.0
-    # via
-    #   llama-stack
-    #   pydantic-settings
-    #   pymilvus
-python-jose==3.5.0
-    # via llama-stack
-python-multipart==0.0.20
-    # via mcp
-pytz==2025.2
-    # via pandas
-pyyaml==6.0.2
-    # via
-    #   accelerate
-    #   autoevals
-    #   datasets
-    #   huggingface-hub
-    #   peft
-    #   pyaml
-    #   transformers
-ramalama==0.9.0
-    # via ramalama-stack
-referencing==0.36.2
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
-regex==2024.11.6
-    # via
-    #   tiktoken
-    #   transformers
-requests==2.32.3
-    # via
-    #   datasets
-    #   huggingface-hub
-    #   llama-stack
-    #   opentelemetry-exporter-otlp-proto-http
-    #   ramalama-stack
-    #   tiktoken
-    #   transformers
-rich==14.0.0
-    # via
-    #   llama-stack
-    #   llama-stack-client
-rpds-py==0.25.1
-    # via
-    #   jsonschema
-    #   referencing
-rsa==4.9.1
-    # via python-jose
-safetensors==0.5.3
-    # via
-    #   accelerate
-    #   peft
-    #   transformers
-scikit-learn==1.7.0
-    # via sentence-transformers
-scipy==1.15.3
-    # via
-    #   scikit-learn
-    #   sentence-transformers
-sentence-transformers==4.1.0
-    # via ramalama-stack
-setuptools==80.9.0
-    # via
-    #   llama-stack
-    #   pymilvus
-    #   torch
-    #   triton
-six==1.17.0
-    # via
-    #   ecdsa
-    #   python-dateutil
-    #   ramalama-stack
-sniffio==1.3.1
-    # via
-    #   anyio
-    #   llama-stack-client
-    #   openai
-sqlalchemy==2.0.41
-    # via ramalama-stack
-sse-starlette==2.3.6
-    # via mcp
-starlette==0.46.2
-    # via
-    #   fastapi
-    #   llama-stack
-    #   mcp
-sympy==1.14.0
-    # via torch
-termcolor==3.1.0
-    # via
-    #   fire
-    #   llama-stack
-    #   llama-stack-client
-threadpoolctl==3.6.0
-    # via scikit-learn
-tiktoken==0.9.0
-    # via llama-stack
-tokenizers==0.21.1
-    # via transformers
-torch==2.7.0
-    # via
-    #   accelerate
-    #   peft
-    #   ramalama-stack
-    #   sentence-transformers
-tqdm==4.67.1
-    # via
-    #   datasets
-    #   huggingface-hub
-    #   llama-stack-client
-    #   milvus-lite
-    #   openai
-    #   peft
-    #   sentence-transformers
-    #   transformers
-transformers==4.52.4
-    # via
-    #   peft
-    #   sentence-transformers
-    #   trl
-triton==3.3.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
-    # via torch
-trl==0.18.1
-    # via ramalama-stack
-typing-extensions==4.14.0
-    # via
-    #   aiosqlite
-    #   anyio
-    #   fastapi
-    #   huggingface-hub
-    #   llama-stack-client
-    #   openai
-    #   opentelemetry-sdk
-    #   pydantic
-    #   pydantic-core
-    #   referencing
-    #   sentence-transformers
-    #   sqlalchemy
-    #   torch
-    #   typing-inspection
|
|
||||||
typing-inspection==0.4.1
|
|
||||||
# via
|
|
||||||
# pydantic
|
|
||||||
# pydantic-settings
|
|
||||||
tzdata==2025.2
|
|
||||||
# via pandas
|
|
||||||
ujson==5.10.0
|
|
||||||
# via pymilvus
|
|
||||||
urllib3==2.4.0
|
|
||||||
# via
|
|
||||||
# blobfile
|
|
||||||
# ramalama-stack
|
|
||||||
# requests
|
|
||||||
uvicorn==0.34.3
|
|
||||||
# via
|
|
||||||
# mcp
|
|
||||||
# ramalama-stack
|
|
||||||
wcwidth==0.2.13
|
|
||||||
# via prompt-toolkit
|
|
||||||
wrapt==1.17.2
|
|
||||||
# via deprecated
|
|
||||||
xxhash==3.5.0
|
|
||||||
# via datasets
|
|
||||||
yarl==1.20.0
|
|
||||||
# via aiohttp
|
|
||||||
zipp==3.22.0
|
|
||||||
# via importlib-metadata
|
|
||||||
|
|
|
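These pinned versions read like the output of a pip-compile-style lock step. A hedged sketch of regenerating such a file, assuming uv is the tool in use; the input and output file names here are placeholders, and the repository's actual lock workflow may differ:

    # Resolve the project's dependencies into a fully pinned list with "# via" annotations
    uv pip compile pyproject.toml -o requirements.txt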
@@ -11,8 +11,8 @@ def get_provider_spec() -> ProviderSpec:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="ramalama",
-            pip_packages=["ramalama>=0.8.5", "pymilvus"],
+            pip_packages=["ramalama>=0.8.1", "faiss-cpu"],
             config_class="config.RamalamaImplConfig",
-            module="ramalama_stack",
+            module="ramalama_adapter",
         ),
     )
@@ -1,6 +1,6 @@
 adapter:
   adapter_type: ramalama
-  pip_packages: ["ramalama>=0.8.5", "pymilvus"]
+  pip_packages: ["ramalama>=0.8.1", "faiss-cpu"]
   config_class: ramalama_stack.config.RamalamaImplConfig
   module: ramalama_stack
 api_dependencies: []
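This adapter definition is what Llama Stack discovers through its external providers directory (the run config below defaults it to ~/.llama/providers.d). A minimal sketch of wiring it up by hand, assuming the spec is saved as ramalama.yaml; the exact install layout may differ from what the package ships:

    # Place the remote inference provider spec where the server looks for it
    mkdir -p ~/.llama/providers.d/remote/inference
    cp ramalama.yaml ~/.llama/providers.d/remote/inference/ramalama.yaml
    # The CI helpers later grep the server log for
    # "remote::ramalama from .*providers.d/remote/inference/ramalama.yaml"
    # to confirm the provider was actually loaded.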
@@ -5,7 +5,6 @@ apis:
 - datasetio
 - eval
 - inference
-- post_training
 - safety
 - scoring
 - telemetry
@@ -21,10 +20,13 @@ providers:
     provider_type: inline::sentence-transformers
     config: {}
   vector_io:
-  - provider_id: milvus
-    provider_type: inline::milvus
+  - provider_id: faiss
+    provider_type: inline::faiss
     config:
-      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ramalama}/milvus_store.db
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:distributions/ramalama}/faiss_store.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -38,16 +40,13 @@ providers:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ramalama}/agents_store.db
-      responses_store:
-        type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ramalama}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llamastack}
+      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ramalama}/trace_store.db
+      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ramalama/trace_store.db}
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -82,13 +81,6 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
-  post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
-    config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -100,32 +92,20 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
+  - provider_id: code-interpreter
+    provider_type: inline::code-interpreter
+    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
-  - provider_id: model-context-protocol
-    provider_type: remote::model-context-protocol
-    config: {}
-  - provider_id: wolfram-alpha
-    provider_type: remote::wolfram-alpha
-    config:
-      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ramalama}/registry.db
-inference_store:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ramalama}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
   provider_id: ramalama
   model_type: llm
-- metadata:
-    embedding_dimension: 384
-  model_id: all-MiniLM-L6-v2
-  provider_id: sentence-transformers
-  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []
@@ -136,8 +116,8 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::wolfram_alpha
-  provider_id: wolfram-alpha
+- toolgroup_id: builtin::code_interpreter
+  provider_id: code-interpreter
 server:
   port: 8321
 external_providers_dir: ${env.EXTERNAL_PROVIDERS_DIR:~/.llama/providers.d}
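The run config is parameterized entirely through environment variables (INFERENCE_MODEL, SQLITE_STORE_DIR, EXTERNAL_PROVIDERS_DIR, OTEL_SERVICE_NAME, TELEMETRY_SINKS, and so on). A minimal launch sketch that mirrors what the CI helpers further down actually do; the model reference is a placeholder:

    # Serve a model with RamaLama (listens on :8080), then start Llama Stack against this config
    export INFERENCE_MODEL="<model reference served by RamaLama>"   # placeholder
    nohup uv run ramalama serve "$INFERENCE_MODEL" > ramalama.log 2>&1 &
    uv run llama stack run ~/.llama/distributions/ramalama/ramalama-run.yaml --image-type venv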
@@ -191,6 +191,7 @@ class RamalamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         )

     async def register_model(self, model: Model) -> Model:
+        model = await self.register_helper.register_model(model)
         res = await self.client.models.list()
         available_models = [m.id async for m in res]
         # Ramalama handles paths on MacOS and Linux differently
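The added register_helper call normalizes the model through the registry helper before the adapter compares it against what the RamaLama server reports. A quick way to inspect both sides of that comparison by hand, using only endpoints exercised elsewhere in these tests:

    # Models as reported by the RamaLama server itself
    curl -sS http://localhost:8080/v1/models
    # Models as exposed through Llama Stack's OpenAI-compatible surface
    curl -sS http://localhost:8321/v1/openai/v1/models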
@@ -1,20 +0,0 @@
#!/bin/bash

main() {
    echo "===> starting 'test-container'..."
    start_and_wait_for_ramalama_server
    test_ramalama_models
    test_ramalama_chat_completion
    start_and_wait_for_llama_stack_container
    test_llama_stack_models
    test_llama_stack_openai_models
    test_llama_stack_chat_completion
    test_llama_stack_openai_chat_completion
    echo "===> 'test-container' completed successfully!"
}

TEST_UTILS=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# shellcheck disable=SC1091
source "$TEST_UTILS/utils.sh"
main "$@"
exit 0
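start_and_wait_for_llama_stack_container (defined in tests/utils.sh further down) drives the containerized variant of this flow. A minimal manual equivalent, reusing the exact image and environment from that helper; RamaLama must already be serving on :8080 for the health check to pass:

    podman run -d --net=host \
        --env INFERENCE_MODEL="$INFERENCE_MODEL" \
        --env RAMALAMA_URL=http://0.0.0.0:8080 \
        --name llama-stack \
        quay.io/ramalama/llama-stack:latest
    # The helpers poll this endpoint until it returns {"status":"OK"}
    curl -s http://localhost:8321/v1/health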
@@ -1,22 +1,118 @@
 #!/bin/bash

+function start_and_wait_for_ramalama_server {
+    # Start ramalama serve in background with logging to 'ramalama.log'
+    nohup uv run ramalama serve "$INFERENCE_MODEL" > ramalama.log 2>&1 &
+    RAMALAMA_PID=$!
+    echo "Started RamaLama with PID: $RAMALAMA_PID"
+
+    # Wait for ramalama to be ready by doing a health check
+    echo "Waiting for RamaLama server..."
+    for i in {1..60}; do
+        echo "Attempt $i to connect to RamaLama..."
+        resp=$(curl -s http://localhost:8080/health)
+        if [ "$resp" == '{"status":"ok"}' ]; then
+            echo "RamaLama server is up and responding!"
+            break
+        fi
+        if [ "$i" -eq 60 ]; then
+            echo "RamaLama server failed to start or respond"
+            echo "RamaLama logs:"
+            cat ramalama.log
+            exit 1
+        fi
+        sleep 1
+    done
+}
+
+function start_and_wait_for_llama_stack_server {
+    # Start llama stack run with logging to 'lls.log'
+    LLAMA_STACK_LOG_FILE=lls.log nohup uv run llama stack run ~/.llama/distributions/ramalama/ramalama-run.yaml --image-type venv &
+    LLS_PID=$!
+    echo "Started Llama Stack with PID: $LLS_PID"
+
+    # Wait for llama stack to be ready by doing a health check, then test for the ramalama provider
+    echo "Waiting for Llama Stack server..."
+    for i in {1..60}; do
+        echo "Attempt $i to connect to Llama Stack..."
+        resp=$(curl -s http://localhost:8321/v1/health)
+        if [ "$resp" == '{"status":"OK"}' ]; then
+            echo "Llama Stack server is up!"
+            if grep -q -e "remote::ramalama from .*providers.d/remote/inference/ramalama.yaml" lls.log; then
+                echo "Llama Stack server is using RamaLama provider"
+                return
+            else
+                echo "Llama Stack server is not using RamaLama provider"
+                echo "Server logs:"
+                cat lls.log
+                exit 1
+            fi
+        fi
+        sleep 1
+    done
+    echo "Llama Stack server failed to start"
+    echo "Server logs:"
+    cat lls.log
+    exit 1
+}
+
+function test_ramalama_chat_completion {
+    echo "===> test_ramalama_chat_completion: start"
+    # shellcheck disable=SC2016
+    resp=$(curl -sS -X POST http://localhost:8080/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d "{\"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}], \"model\": \"$INFERENCE_MODEL\"}")
+    if echo "$resp" | grep -q "choices"; then
+        echo "===> test_ramalama_chat_completion: pass"
+        return
+    else
+        echo "===> test_ramalama_chat_completion: fail"
+        echo "RamaLama logs:"
+        cat ramalama.log
+        exit 1
+    fi
+}
+
+function test_llama_stack_chat_completion {
+    echo "===> test_llama_stack_chat_completion: start"
+    nohup uv run llama-stack-client configure --endpoint http://localhost:8321 --api-key none
+    if nohup uv run llama-stack-client inference chat-completion --message "tell me a joke" | grep -q "completion_message"; then
+        echo "===> test_llama_stack_chat_completion: pass"
+        return
+    else
+        echo "===> test_llama_stack_chat_completion: fail"
+        echo "Server logs:"
+        cat lls.log
+        exit 1
+    fi
+}
+
+function test_llama_stack_openai_chat_completion {
+    echo "===> test_llama_stack_openai_chat_completion: start"
+    # shellcheck disable=SC2016
+    resp=$(curl -sS -X POST http://localhost:8321/v1/openai/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d "{\"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}], \"model\": \"$INFERENCE_MODEL\"}")
+    if echo "$resp" | grep -q "choices"; then
+        echo "===> test_llama_stack_openai_chat_completion: pass"
+        return
+    else
+        echo "===> test_llama_stack_openai_chat_completion: fail"
+        echo "Server logs:"
+        cat lls.log
+        exit 1
+    fi
+}
+
 main() {
     echo "===> starting 'test-external-providers'..."
     start_and_wait_for_ramalama_server
-    test_ramalama_models
     test_ramalama_chat_completion
     start_and_wait_for_llama_stack_server
-    test_llama_stack_models
-    test_llama_stack_openai_models
     test_llama_stack_chat_completion
     test_llama_stack_openai_chat_completion
     echo "===> 'test-external-providers' completed successfully!"
 }

-TEST_UTILS=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
-# shellcheck disable=SC2153,SC2034
-INFERENCE_MODEL_NO_COLON=$(echo "$INFERENCE_MODEL" | tr ':' '_')
-# shellcheck disable=SC1091
-source "$TEST_UTILS/utils.sh"
 main "$@"
 exit 0
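A sketch of invoking this test locally; the script path is an assumption (the repository keeps its other test assets under tests/), and the model reference is a placeholder:

    # Requires uv, ramalama, and llama-stack available in the environment
    export INFERENCE_MODEL="<model reference>"   # placeholder
    bash tests/test-external-providers.sh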
@@ -1,166 +0,0 @@
import os
import uuid
from llama_stack_client import LlamaStackClient, RAGDocument


def setup_client():
    """Initialize Llama Stack client with configuration"""
    base_url = "http://localhost:8321"

    client = LlamaStackClient(base_url=base_url, api_key="none", timeout=10.0)

    print(f"Connected to Llama Stack server at {base_url}")
    return client


def setup_inference_params():
    """Configure inference parameters"""
    model_id = os.getenv(
        "INFERENCE_MODEL",
        "bartowski/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf",
    )

    temperature = float(os.getenv("TEMPERATURE", 0.0))
    if temperature > 0.0:
        top_p = float(os.getenv("TOP_P", 0.95))
        strategy = {"type": "top_p", "temperature": temperature, "top_p": top_p}
    else:
        strategy = {"type": "greedy"}

    max_tokens = int(os.getenv("MAX_TOKENS", 4096))

    sampling_params = {
        "strategy": strategy,
        "max_tokens": max_tokens,
    }

    stream_env = os.getenv("STREAM", "False")
    stream = stream_env == "True"

    print("Inference Parameters:")
    print(f"\tModel: {model_id}")
    print(f"\tSampling Parameters: {sampling_params}")
    print(f"\tStream: {stream}")

    return model_id, sampling_params, stream


def setup_vector_db(client):
    """Setup vector database for RAG"""
    vector_db_id = f"test_vector_db_{uuid.uuid4().hex[:8]}"

    # Find embedding model from available models
    models = client.models.list()
    embedding_model = None
    for model in models:
        if hasattr(model, "model_type") and model.model_type == "embedding":
            embedding_model = model.identifier
            break

    if not embedding_model:
        raise Exception("No embedding model found")

    print(f"Using embedding model: {embedding_model}")

    # Register vector database
    client.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model=embedding_model,
        embedding_dimension=int(os.getenv("VDB_EMBEDDING_DIMENSION", 384)),
        provider_id=os.getenv("VDB_PROVIDER", "milvus"),
    )

    # Ingest simple test documents instead of external URLs
    test_content = [
        "RamaLama Stack is an external provider for Llama Stack that allows for the use of RamaLama for inference.",
        "Podman is a container management tool that provides a Docker-compatible command line interface without requiring a daemon.",
        "Podman can run containers rootlessly and provides robust security isolation.",
    ]

    documents = [
        RAGDocument(
            document_id=f"test_doc_{i}",
            content=content,
            mime_type="text/plain",
            metadata={"source": f"test_document_{i}"},
        )
        for i, content in enumerate(test_content)
    ]

    print(f"Ingesting {len(documents)} test documents into vector database...")
    client.tool_runtime.rag_tool.insert(
        documents=documents,
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=int(os.getenv("VECTOR_DB_CHUNK_SIZE", 128)),
    )

    print(f"Vector database '{vector_db_id}' setup complete")
    return vector_db_id


def run_rag_query(client, model_id, sampling_params, stream, vector_db_id, query):
    """Execute RAG query and return response"""
    print(f"\nUser> {query}")

    rag_response = client.tool_runtime.rag_tool.query(
        content=query, vector_db_ids=[vector_db_id]
    )

    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    prompt_context = rag_response.content
    extended_prompt = f"Please answer the given query using the context below.\n\nCONTEXT:\n{prompt_context}\n\nQUERY:\n{query}"
    messages.append({"role": "user", "content": extended_prompt})

    response = client.inference.chat_completion(
        messages=messages,
        model_id=model_id,
        sampling_params=sampling_params,
        stream=stream,
    )

    print("inference> ", end="")
    if stream:
        for chunk in response:
            if hasattr(chunk, "event") and hasattr(chunk.event, "delta"):
                if hasattr(chunk.event.delta, "text"):
                    print(chunk.event.delta.text, end="")
        print()
    else:
        print(response.completion_message.content)


def main():
    """Main function to run RAG test"""
    print("=== Llama Stack RAG Test ===")

    try:
        client = setup_client()
        model_id, sampling_params, stream = setup_inference_params()

        vector_db_id = setup_vector_db(client)

        queries = [
            "What is RamaLama Stack?",
            "What is Podman?",
            "Can Podman run in rootless mode?",
        ]

        print("\n=== Running RAG Queries ===")
        for query in queries:
            run_rag_query(
                client, model_id, sampling_params, stream, vector_db_id, query
            )
            print()

        print("=== RAG Test Complete ===")

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    exit(main())
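The RAG test reads all of its knobs from the environment (INFERENCE_MODEL, TEMPERATURE, TOP_P, MAX_TOKENS, STREAM, VDB_PROVIDER, VDB_EMBEDDING_DIMENSION, VECTOR_DB_CHUNK_SIZE). A minimal run against an already-running stack, matching how tests/test-rag.sh invokes it; the overrides shown are illustrative:

    # Stream responses and force the Milvus vector_io provider from the run config
    STREAM=True VDB_PROVIDER=milvus uv run python tests/test-rag.py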
@@ -1,40 +0,0 @@
#!/bin/bash

function test_rag_functionality {
    echo "===> test_rag_functionality: start"

    if uv run python tests/test-rag.py; then
        echo "===> test_rag_functionality: pass"
        return 0
    else
        echo "===> test_rag_functionality: fail"
        echo "RAG test script output above shows the failure details"
        return 1
    fi
}

main() {
    echo "===> starting 'test-rag'..."

    # Check if services are already running (from previous tests)
    if curl -s http://localhost:8321/v1/health >/dev/null 2>&1 && curl -s http://localhost:8080/health >/dev/null 2>&1; then
        echo "Using existing RamaLama and Llama Stack servers"
    else
        echo "Starting fresh servers for RAG test"
        start_and_wait_for_ramalama_server
        start_and_wait_for_llama_stack_server
    fi

    if test_rag_functionality; then
        echo "===> 'test-rag' completed successfully!"
    else
        echo "===> 'test-rag' failed!"
        exit 1
    fi
}

TEST_UTILS=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# shellcheck disable=SC1091
source "$TEST_UTILS/utils.sh"
main "$@"
exit 0
@@ -1,77 +0,0 @@
#!/bin/bash

function start_and_wait_for_streamlit_ui_linux {
    echo "Starting Streamlit UI for Linux..."

    podman run -d --rm --network=host --name=streamlit-ui quay.io/redhat-et/streamlit_client:0.1.0

    echo "Waiting for Streamlit UI to be ready..."
    for i in {1..30}; do
        echo "Attempt $i to connect to Streamlit UI..."
        if curl -s http://localhost:8501 >/dev/null 2>&1; then
            echo "Streamlit UI is up and responding on port 8501!"
            return 0
        fi
        if [ "$i" -eq 30 ]; then
            echo "Streamlit UI failed to start or respond"
            echo "Container logs:"
            podman logs streamlit-ui
            return 1
        fi
        sleep 2
    done
}

function test_streamlit_ui_linux {
    echo "===> test_streamlit_ui_linux: start"

    if start_and_wait_for_streamlit_ui_linux; then
        # Test that the UI is accessible and returns HTML content
        resp=$(curl -sS http://localhost:8501)
        if echo "$resp" | grep -q -i "streamlit\|html"; then
            echo "===> test_streamlit_ui_linux: pass"
            return 0
        else
            echo "===> test_streamlit_ui_linux: fail - UI not serving expected content"
            echo "Response: $resp"
            return 1
        fi
    else
        echo "===> test_streamlit_ui_linux: fail - UI failed to start"
        return 1
    fi
}

function cleanup_streamlit_ui {
    echo "Cleaning up Streamlit UI container..."
    podman rm -f streamlit-ui >/dev/null 2>&1 || true
}

main() {
    echo "===> starting 'test-ui-linux'..."

    # Only run on Linux
    # Need a fix to published ports in ramalama to run on MacOS
    if [[ "$OSTYPE" != "linux-gnu"* ]]; then
        echo "This test is only for Linux systems. Current OS: $OSTYPE"
        echo "===> 'test-ui-linux' skipped!"
        exit 0
    fi

    trap cleanup_streamlit_ui EXIT

    start_and_wait_for_ramalama_server
    start_and_wait_for_llama_stack_server

    test_streamlit_ui_linux

    cleanup_streamlit_ui

    echo "===> 'test-ui-linux' completed successfully!"
}

TEST_UTILS=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# shellcheck disable=SC1091
source "$TEST_UTILS/utils.sh"
main "$@"
exit 0
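For poking at the UI outside of CI, the same container can be run directly against a local stack; this mirrors the helper above rather than adding anything new:

    podman run -d --rm --network=host --name=streamlit-ui quay.io/redhat-et/streamlit_client:0.1.0
    # then browse to http://localhost:8501 (the test only greps the page for "streamlit" or "html")
    podman rm -f streamlit-ui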
tests/utils.sh
@@ -1,186 +0,0 @@
#!/bin/bash

function start_and_wait_for_ramalama_server {
    # Start ramalama serve in background with logging to 'ramalama-$INFERENCE_MODEL_NO_COLON.log'
    nohup uv run ramalama serve "$INFERENCE_MODEL" > "ramalama-$INFERENCE_MODEL_NO_COLON.log" 2>&1 &
    RAMALAMA_PID=$!
    echo "Started RamaLama with PID: $RAMALAMA_PID"

    # Wait for ramalama to be ready by doing a health check
    echo "Waiting for RamaLama server..."
    for i in {1..60}; do
        echo "Attempt $i to connect to RamaLama..."
        resp=$(curl -s http://localhost:8080/health)
        if [ "$resp" == '{"status":"ok"}' ]; then
            echo "RamaLama server is up and responding!"
            break
        fi
        if [ "$i" -eq 60 ]; then
            echo "RamaLama server failed to start or respond"
            echo "RamaLama logs:"
            cat "ramalama-$INFERENCE_MODEL_NO_COLON.log"
            exit 1
        fi
        sleep 1
    done
}

function start_and_wait_for_llama_stack_server {
    # Start llama stack run with logging to 'lls-$INFERENCE_MODEL_NO_COLON.log'
    LLAMA_STACK_LOG_FILE="lls-$INFERENCE_MODEL_NO_COLON.log" nohup uv run llama stack run ~/.llama/distributions/ramalama/ramalama-run.yaml --image-type venv &
    LLS_PID=$!
    echo "Started Llama Stack server with PID: $LLS_PID"

    # Wait for llama stack to be ready by doing a health check, then test for the ramalama provider
    echo "Waiting for Llama Stack server..."
    for i in {1..60}; do
        echo "Attempt $i to connect to Llama Stack..."
        resp=$(curl -s http://localhost:8321/v1/health)
        if [ "$resp" == '{"status":"OK"}' ]; then
            echo "Llama Stack server is up!"
            if grep -q -e "remote::ramalama from .*providers.d/remote/inference/ramalama.yaml" "lls-$INFERENCE_MODEL_NO_COLON.log"; then
                echo "Llama Stack server is using RamaLama provider"
                return
            else
                echo "Llama Stack server is not using RamaLama provider"
                echo "Server logs:"
                cat "lls-$INFERENCE_MODEL_NO_COLON.log"
                exit 1
            fi
        fi
        sleep 1
    done
    echo "Llama Stack server failed to start"
    echo "Server logs:"
    cat "lls-$INFERENCE_MODEL_NO_COLON.log"
    exit 1
}

function start_and_wait_for_llama_stack_container {
    # Start llama stack run
    podman run \
        -d \
        --net=host \
        --env INFERENCE_MODEL="$INFERENCE_MODEL" \
        --env RAMALAMA_URL=http://0.0.0.0:8080 \
        --name llama-stack \
        quay.io/ramalama/llama-stack:latest
    LLS_PID=$!
    echo "Started Llama Stack container with PID: $LLS_PID"

    # Wait for llama stack to be ready by doing a health check, then test for the ramalama provider
    echo "Waiting for Llama Stack server..."
    for i in {1..60}; do
        echo "Attempt $i to connect to Llama Stack..."
        resp=$(curl -s http://localhost:8321/v1/health)
        if [ "$resp" == '{"status":"OK"}' ]; then
            echo "Llama Stack server is up!"
            if podman logs llama-stack | grep -q -e "remote::ramalama from .*providers.d/remote/inference/ramalama.yaml"; then
                echo "Llama Stack server is using RamaLama provider"
                return
            else
                echo "Llama Stack server is not using RamaLama provider"
                echo "Container logs:"
                podman logs llama-stack
                exit 1
            fi
        fi
        sleep 1
    done
    echo "Llama Stack server failed to start"
    echo "Container logs:"
    podman logs llama-stack
    exit 1
}

function test_ramalama_models {
    echo "===> test_ramalama_models: start"
    # shellcheck disable=SC2016
    resp=$(curl -sS http://localhost:8080/v1/models)
    if echo "$resp" | grep -q "$INFERENCE_MODEL"; then
        echo "===> test_ramalama_models: pass"
        return
    else
        echo "===> test_ramalama_models: fail"
        echo "RamaLama logs:"
        cat "ramalama-$INFERENCE_MODEL_NO_COLON.log"
        exit 1
    fi
}

function test_ramalama_chat_completion {
    echo "===> test_ramalama_chat_completion: start"
    # shellcheck disable=SC2016
    resp=$(curl -sS -X POST http://localhost:8080/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d "{\"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}], \"model\": \"$INFERENCE_MODEL\"}")
    if echo "$resp" | grep -q "choices"; then
        echo "===> test_ramalama_chat_completion: pass"
        return
    else
        echo "===> test_ramalama_chat_completion: fail"
        echo "RamaLama logs:"
        cat "ramalama-$INFERENCE_MODEL_NO_COLON.log"
        exit 1
    fi
}

function test_llama_stack_models {
    echo "===> test_llama_stack_models: start"
    nohup uv run llama-stack-client configure --endpoint http://localhost:8321 --api-key none
    if nohup uv run llama-stack-client models list | grep -q "$INFERENCE_MODEL"; then
        echo "===> test_llama_stack_models: pass"
        return
    else
        echo "===> test_llama_stack_models: fail"
        echo "Server logs:"
        cat "lls-$INFERENCE_MODEL_NO_COLON.log" || podman logs llama-stack
        exit 1
    fi
}

function test_llama_stack_openai_models {
    echo "===> test_llama_stack_openai_models: start"
    # shellcheck disable=SC2016
    resp=$(curl -sS http://localhost:8321/v1/openai/v1/models)
    if echo "$resp" | grep -q "$INFERENCE_MODEL"; then
        echo "===> test_llama_stack_openai_models: pass"
        return
    else
        echo "===> test_llama_stack_openai_models: fail"
        echo "Server logs:"
        cat "lls-$INFERENCE_MODEL_NO_COLON.log" || podman logs llama-stack
        exit 1
    fi
}

function test_llama_stack_chat_completion {
    echo "===> test_llama_stack_chat_completion: start"
    nohup uv run llama-stack-client configure --endpoint http://localhost:8321 --api-key none
    if nohup uv run llama-stack-client inference chat-completion --message "tell me a joke" | grep -q "completion_message"; then
        echo "===> test_llama_stack_chat_completion: pass"
        return
    else
        echo "===> test_llama_stack_chat_completion: fail"
        echo "Server logs:"
        cat "lls-$INFERENCE_MODEL_NO_COLON.log" || podman logs llama-stack
        exit 1
    fi
}

function test_llama_stack_openai_chat_completion {
    echo "===> test_llama_stack_openai_chat_completion: start"
    # shellcheck disable=SC2016
    resp=$(curl -sS -X POST http://localhost:8321/v1/openai/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d "{\"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}], \"model\": \"$INFERENCE_MODEL\"}")
    if echo "$resp" | grep -q "choices"; then
        echo "===> test_llama_stack_openai_chat_completion: pass"
        return
    else
        echo "===> test_llama_stack_openai_chat_completion: fail"
        echo "Server logs:"
        cat "lls-$INFERENCE_MODEL_NO_COLON.log" || podman logs llama-stack
        exit 1
    fi
}