Compare commits
37 Commits
Author | SHA1 | Date |
---|---|---|
|
54f2b31184 | |
|
2b726d8f90 | |
|
2ee90461d0 | |
|
b824525be3 | |
|
ab68d31a24 | |
|
53d2ea3789 | |
|
574fe407eb | |
|
8a91e6e59c | |
|
3e39d7234c | |
|
d08ff304cd | |
|
33ef5dc813 | |
|
f9dfde02fd | |
|
538dd357e6 | |
|
aeb5aa8b88 | |
|
19e37cd379 | |
|
875a920d4a | |
|
ef99fe1c54 | |
|
c66b0827a7 | |
|
ba7e934b21 | |
|
06655002c5 | |
|
b005def0a5 | |
|
f9e2e9bb31 | |
|
f96100fad5 | |
|
a929699e98 | |
|
bf2549856f | |
|
787010a637 | |
|
eb921d2b6f | |
|
7bdada58eb | |
|
494b0f474f | |
|
afcfe91dfa | |
|
cabfb2bc31 | |
|
d3c6dd985a | |
|
9cd4ac76a1 | |
|
d118bf8a26 | |
|
eff4b5791c | |
|
8b3a483269 | |
|
3c404de1b1 |
|
@ -30,6 +30,7 @@ VLLM_VERSION=$2
|
|||
VLLM_COMMIT=$3
|
||||
OLD=/tmp/orig_pr_body.txt
|
||||
NEW=/tmp/new_pr_body.txt
|
||||
FINAL=/tmp/final_pr_body.txt
|
||||
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
@ -41,16 +42,18 @@ sed -i '/- vLLM .*$/d' "${NEW}"
|
|||
echo ""
|
||||
echo "- vLLM version: $VLLM_VERSION"
|
||||
echo "- vLLM main: $VLLM_COMMIT"
|
||||
echo ""
|
||||
} >> "${NEW}"
|
||||
|
||||
# Remove redundant empty lines
|
||||
uniq "${NEW}" > "${FINAL}"
|
||||
|
||||
# Run this only if ${NEW} is different than ${OLD}
|
||||
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||
if ! cmp -s "${OLD}" "${FINAL}"; then
|
||||
echo
|
||||
echo "Updating PR body:"
|
||||
echo
|
||||
cat "${NEW}"
|
||||
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||
gh pr edit --body-file "${FINAL}" "${PR_NUMBER}"
|
||||
else
|
||||
echo "No changes needed"
|
||||
fi
|
||||
|
|
|
@ -146,11 +146,11 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
|
||||
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
|
||||
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
|
||||
apt-get update -y
|
||||
apt install git -y
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
|
@ -169,6 +169,23 @@ jobs:
|
|||
working-directory: ./vllm-empty
|
||||
run: VLLM_TARGET_DEVICE=empty pip install -e .
|
||||
|
||||
- name: Resolve vllm-ascend version
|
||||
run: |
|
||||
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
|
||||
|
||||
if [[ "$VERSION_INPUT" == "main" ]]; then
|
||||
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
|
||||
LATEST_TAG=$(echo "$TAGS" | head -n1)
|
||||
if [[ -z "$LATEST_TAG" ]]; then
|
||||
RESOLVED_VERSION="main"
|
||||
else
|
||||
RESOLVED_VERSION="$LATEST_TAG"
|
||||
fi
|
||||
else
|
||||
RESOLVED_VERSION="$VERSION_INPUT"
|
||||
fi
|
||||
echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout vllm-project/vllm-ascend repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
|
@ -224,7 +241,6 @@ jobs:
|
|||
pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}'
|
||||
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
|
||||
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
|
||||
echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}"
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
- name: Print versions
|
||||
|
@ -386,4 +402,4 @@ jobs:
|
|||
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
|
||||
});
|
||||
core.info(`Created PR #${pr.data.number}`);
|
||||
|
||||
|
||||
|
|
|
@ -6,10 +6,10 @@ name: 'image / openEuler / 310p'
|
|||
# - push: ${{ github.event_name != 'pull_request' }} ==> false
|
||||
# 2. branches push trigger image publish
|
||||
# - is for branch/dev/nightly image
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main-310p-openeuler / vllm-ascend:*-dev-310p-openeuler
|
||||
# 3. tags push trigger image publish
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-openeuler|latest / vllm-ascend:v1.2.3rc1-openeuler
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-310p-openeuler / vllm-ascend:v1.2.3rc1-310p-openeuler
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
|
@ -33,7 +33,7 @@ on:
|
|||
tags:
|
||||
- 'v*'
|
||||
paths:
|
||||
- '.github/workflows/image_310p.openeuler.yml'
|
||||
- '.github/workflows/image_310p_openeuler.yml'
|
||||
- 'Dockerfile.310p.openEuler'
|
||||
- 'vllm_ascend/**'
|
||||
|
||||
|
@ -63,16 +63,18 @@ jobs:
|
|||
# Note for test case
|
||||
# https://github.com/marketplace/actions/docker-metadata-action#typeref
|
||||
# 1. branch job pulish per main/*-dev branch commits
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N-openeuler is fine
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N-310p-openeuler is fine
|
||||
# 3. only pep440 matched tag will be published:
|
||||
# - v0.7.1 --> v0.7.1-openeuler, latest
|
||||
# - pre/post/dev: v0.7.1rc1-openeuler/v0.7.1rc1-openeuler/v0.7.1rc1.dev1-openeuler/v0.7.1.post1-openeuler, no latest
|
||||
# - v0.7.1 --> v0.7.1-310p-openeuler
|
||||
# - pre/post/dev: v0.7.1rc1-310p-openeuler/v0.7.1rc1-310p-openeuler/v0.7.1rc1.dev1-310p-openeuler/v0.7.1.post1-310p-openeuler, no latest
|
||||
# which follow the rule from vLLM with prefix v
|
||||
# TODO(yikun): the post release might be considered as latest release
|
||||
tags: |
|
||||
type=ref,event=branch,suffix=-310p-openeuler
|
||||
type=ref,event=pr,suffix=-openeuler
|
||||
type=ref,event=pr,suffix=-310p-openeuler
|
||||
type=pep440,pattern={{raw}},suffix=-310p-openeuler
|
||||
flavor:
|
||||
latest=false
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
|
@ -112,3 +114,4 @@ jobs:
|
|||
file: Dockerfile.310p.openEuler
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
||||
|
|
|
@ -6,10 +6,10 @@ name: 'image / Ubuntu / 310p'
|
|||
# - push: ${{ github.event_name != 'pull_request' }} ==> false
|
||||
# 2. branches push trigger image publish
|
||||
# - is for branch/dev/nightly image
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main-310p / vllm-ascend:*-dev-310p
|
||||
# 3. tags push trigger image publish
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3|latest / vllm-ascend:v1.2.3rc1
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-310p / vllm-ascend:v1.2.3rc1-310p
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
|
@ -61,14 +61,16 @@ jobs:
|
|||
# 1. branch job pulish per main/*-dev branch commits
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N is fine
|
||||
# 3. only pep440 matched tag will be published:
|
||||
# - v0.7.1 --> v0.7.1, latest
|
||||
# - pre/post/dev: v0.7.1rc1/v0.7.1rc1/v0.7.1rc1.dev1/v0.7.1.post1, no latest
|
||||
# - v0.7.1 --> v0.7.1-310p
|
||||
# - pre/post/dev: v0.7.1rc1-310p/v0.7.1rc1-310p/v0.7.1rc1.dev1-310p/v0.7.1.post1-310p, no latest
|
||||
# which follow the rule from vLLM with prefix v
|
||||
# TODO(yikun): the post release might be considered as latest release
|
||||
tags: |
|
||||
type=ref,event=branch,suffix=-310p
|
||||
type=ref,event=pr,suffix=-310p
|
||||
type=pep440,pattern={{raw}},suffix=-310p
|
||||
flavor:
|
||||
latest=false
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
|
@ -108,3 +110,4 @@ jobs:
|
|||
tags: ${{ steps.meta.outputs.tags }}
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
|
@ -0,0 +1,117 @@
|
|||
name: 'image / openEuler / a3'
|
||||
# This is a docker build check and publish job:
|
||||
# 1. PR Triggered docker image build check
|
||||
# - is for image build check
|
||||
# - Enable on main/*-dev branch
|
||||
# - push: ${{ github.event_name != 'pull_request' }} ==> false
|
||||
# 2. branches push trigger image publish
|
||||
# - is for branch/dev/nightly image
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# 3. tags push trigger image publish
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-a3-openeuler / vllm-ascend:v1.2.3rc1-a3-openeuler
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- 'main'
|
||||
- '*-dev'
|
||||
paths:
|
||||
- '.github/workflows/image_a3_openeuler.yml'
|
||||
- 'Dockerfile.a3.openEuler'
|
||||
- 'vllm_ascend/**'
|
||||
- 'setup.py'
|
||||
- 'pyproject.toml'
|
||||
- 'requirements.txt'
|
||||
- 'cmake/**'
|
||||
- 'CMakeLists.txt'
|
||||
- 'csrc/**'
|
||||
push:
|
||||
# Publish image when tagging, the Dockerfile in tag will be build as tag image
|
||||
branches:
|
||||
- 'main'
|
||||
- '*-dev'
|
||||
tags:
|
||||
- 'v*'
|
||||
paths:
|
||||
- '.github/workflows/image_a3_openeuler.yml'
|
||||
- 'Dockerfile.a3.openEuler'
|
||||
- 'vllm_ascend/**'
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: vllm-ascend image build
|
||||
runs-on: >-
|
||||
${{
|
||||
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
|
||||
'ubuntu-latest' ||
|
||||
'ubuntu-24.04-arm'
|
||||
}}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Print
|
||||
run: |
|
||||
lscpu
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
# TODO(yikun): add more hub image and a note on release policy for container image
|
||||
images: |
|
||||
quay.io/ascend/vllm-ascend
|
||||
# Note for test case
|
||||
# https://github.com/marketplace/actions/docker-metadata-action#typeref
|
||||
# 1. branch job pulish per main/*-dev branch commits
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N-a3-openeuler is fine
|
||||
# 3. only pep440 matched tag will be published:
|
||||
# - v0.7.1 --> v0.7.1-a3-openeuler
|
||||
# - pre/post/dev: v0.7.1rc1-a3-openeuler/v0.7.1rc1-a3-openeuler/v0.7.1rc1.dev1-a3-openeuler/v0.7.1.post1-a3-openeuler, no latest
|
||||
# which follow the rule from vLLM with prefix v
|
||||
# TODO(yikun): the post release might be considered as latest release
|
||||
tags: |
|
||||
type=ref,event=branch,suffix=-a3-openeuler
|
||||
type=ref,event=pr,suffix=-a3-openeuler
|
||||
type=pep440,pattern={{raw}},suffix=-a3-openeuler
|
||||
flavor:
|
||||
latest=false
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
docker-images: false
|
||||
|
||||
- name: Build - Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build - Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Publish - Login to Quay Container Registry
|
||||
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ vars.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_PASSWORD }}
|
||||
|
||||
- name: Build and push a3
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
platforms: >-
|
||||
${{
|
||||
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
|
||||
'linux/amd64,linux/arm64' ||
|
||||
'linux/arm64'
|
||||
}}
|
||||
# use the current repo path as the build context, ensure .git is contained
|
||||
context: .
|
||||
# only trigger when tag, branch/main push
|
||||
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
file: Dockerfile.a3.openEuler
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
name: 'image / Ubuntu / a3'
|
||||
# This is a docker build check and publish job:
|
||||
# 1. PR Triggered docker image build check
|
||||
# - is for image build check
|
||||
# - Enable on main/*-dev branch
|
||||
# - push: ${{ github.event_name != 'pull_request' }} ==> false
|
||||
# 2. branches push trigger image publish
|
||||
# - is for branch/dev/nightly image
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# 3. tags push trigger image publish
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-a3|vllm-ascend:v1.2.3rc1-a3
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- 'main'
|
||||
- '*-dev'
|
||||
paths:
|
||||
- '.github/workflows/image_a3_ubuntu.yml'
|
||||
- 'Dockerfile.a3'
|
||||
- 'vllm_ascend/**'
|
||||
- 'setup.py'
|
||||
- 'pyproject.toml'
|
||||
- 'requirements.txt'
|
||||
- 'cmake/**'
|
||||
- 'CMakeLists.txt'
|
||||
- 'csrc/**'
|
||||
push:
|
||||
# Publish image when tagging, the Dockerfile in tag will be build as tag image
|
||||
branches:
|
||||
- 'main'
|
||||
- '*-dev'
|
||||
tags:
|
||||
- 'v*'
|
||||
paths:
|
||||
- '.github/workflows/image_a3_ubuntu.yml'
|
||||
- 'Dockerfile.a3'
|
||||
- 'vllm_ascend/**'
|
||||
jobs:
|
||||
|
||||
build:
|
||||
name: vllm-ascend image build
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Print
|
||||
run: |
|
||||
lscpu
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
# TODO(yikun): add more hub image and a note on release policy for container image
|
||||
images: |
|
||||
quay.io/ascend/vllm-ascend
|
||||
# Note for test case
|
||||
# https://github.com/marketplace/actions/docker-metadata-action#typeref
|
||||
# 1. branch job pulish per main/*-dev branch commits
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N-a3 is fine
|
||||
# 3. only pep440 matched tag will be published:
|
||||
# - v0.7.1 --> v0.7.1-a3
|
||||
# - pre/post/dev: v0.7.1rc1-a3/v0.7.1rc1-a3/v0.7.1rc1.dev1-a3/v0.7.1.post1-a3, no latest
|
||||
# which follow the rule from vLLM with prefix v
|
||||
# TODO(yikun): the post release might be considered as latest release
|
||||
tags: |
|
||||
type=ref,event=branch,suffix=-a3
|
||||
type=ref,event=pr,suffix=-a3
|
||||
type=pep440,pattern={{raw}},suffix=-a3
|
||||
flavor:
|
||||
latest=false
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
docker-images: false
|
||||
|
||||
- name: Build - Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Build - Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Publish - Login to Quay Container Registry
|
||||
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ vars.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_PASSWORD }}
|
||||
|
||||
- name: Build and push a3
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
platforms: >-
|
||||
${{
|
||||
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
|
||||
'linux/amd64,linux/arm64' ||
|
||||
'linux/amd64'
|
||||
}}
|
||||
# use the current repo path as the build context, ensure .git is contained
|
||||
context: .
|
||||
file: Dockerfile.a3
|
||||
# only trigger when tag, branch/main push
|
||||
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
||||
|
|
@ -6,10 +6,9 @@ name: 'image / openEuler'
|
|||
# - push: ${{ github.event_name != 'pull_request' }} ==> false
|
||||
# 2. branches push trigger image publish
|
||||
# - is for branch/dev/nightly image
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# 3. tags push trigger image publish
|
||||
# - commits are merge into main/*-dev ==> vllm-ascend:main-openeuler / vllm-ascend:*-dev-openeuler
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-openeuler|latest / vllm-ascend:v1.2.3rc1-openeuler
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-openeuler / vllm-ascend:v1.2.3rc1-openeuler
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
|
@ -65,7 +64,7 @@ jobs:
|
|||
# 1. branch job pulish per main/*-dev branch commits
|
||||
# 2. main and dev pull_request is build only, so the tag pr-N-openeuler is fine
|
||||
# 3. only pep440 matched tag will be published:
|
||||
# - v0.7.1 --> v0.7.1-openeuler, latest
|
||||
# - v0.7.1 --> v0.7.1-openeuler
|
||||
# - pre/post/dev: v0.7.1rc1-openeuler/v0.7.1rc1-openeuler/v0.7.1rc1.dev1-openeuler/v0.7.1.post1-openeuler, no latest
|
||||
# which follow the rule from vLLM with prefix v
|
||||
# TODO(yikun): the post release might be considered as latest release
|
||||
|
@ -73,6 +72,8 @@ jobs:
|
|||
type=ref,event=branch,suffix=-openeuler
|
||||
type=ref,event=pr,suffix=-openeuler
|
||||
type=pep440,pattern={{raw}},suffix=-openeuler
|
||||
flavor:
|
||||
latest=true
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
|
@ -112,3 +113,4 @@ jobs:
|
|||
file: Dockerfile.openEuler
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
||||
|
|
|
@ -9,7 +9,7 @@ name: 'image / Ubuntu'
|
|||
# - commits are merge into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
|
||||
# 3. tags push trigger image publish
|
||||
# - is for final release image
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3|latest / vllm-ascend:v1.2.3rc1
|
||||
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3 / vllm-ascend:v1.2.3rc1
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
|
@ -69,6 +69,8 @@ jobs:
|
|||
type=ref,event=branch
|
||||
type=ref,event=pr
|
||||
type=pep440,pattern={{raw}}
|
||||
flavor:
|
||||
latest=true
|
||||
|
||||
- name: Free up disk space
|
||||
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
|
||||
|
@ -108,3 +110,4 @@ jobs:
|
|||
tags: ${{ steps.meta.outputs.tags }}
|
||||
build-args: |
|
||||
PIP_INDEX_URL=https://pypi.org/simple
|
||||
provenance: false
|
||||
|
|
|
@ -20,9 +20,10 @@ name: 'Benchmarks / Performance'
|
|||
|
||||
on:
|
||||
schedule:
|
||||
# Run at 02:00 everyday
|
||||
- cron: '00 18 * * *'
|
||||
|
||||
# Run benchmarks at 20:00 and 03:00 Beijing time (UTC+8)
|
||||
- cron: "0 12 * * *"
|
||||
- cron: "0 19 * * *"
|
||||
|
||||
workflow_dispatch:
|
||||
# Allow manual triggering of the workflow
|
||||
|
||||
|
@ -81,6 +82,8 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
# keep using tuna's proxy since linux-arm64-npu-static-8 is in another region
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
|
||||
- name: Install system dependencies
|
||||
|
|
|
@ -75,7 +75,9 @@ jobs:
|
|||
echo "Replacing /vllm-workspace/vllm-ascend/tests/e2e ..."
|
||||
rm -rf /vllm-workspace/vllm-ascend/tests/e2e
|
||||
mkdir -p /vllm-workspace/vllm-ascend/tests
|
||||
# Overwrite e2e and examples
|
||||
cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/
|
||||
cp -r examples /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Simulate container to enter directory
|
||||
cd /workspace
|
||||
|
|
|
@ -41,39 +41,38 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
lint:
|
||||
# Only trigger lint on pull request
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
uses: ./.github/workflows/pre-commit.yml
|
||||
|
||||
changes:
|
||||
# Only trigger changes on pull request
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read
|
||||
outputs:
|
||||
e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
|
||||
ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
|
||||
steps:
|
||||
- uses: dorny/paths-filter@v3
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
e2e_tracker:
|
||||
- 'vllm_ascend/**'
|
||||
- 'csrc/**'
|
||||
- 'cmake/**'
|
||||
- 'tests/e2e/**'
|
||||
- 'tests/conftest.py'
|
||||
- 'tests/model_utils.py'
|
||||
- 'tests/utils.py'
|
||||
ut_tracker:
|
||||
- 'tests/ut/**'
|
||||
- uses: actions/checkout@v4
|
||||
- uses: dorny/paths-filter@v3
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
e2e_tracker:
|
||||
- '.github/workflows/vllm_ascend_test.yaml'
|
||||
- 'vllm_ascend/**'
|
||||
- 'csrc/**'
|
||||
- 'cmake/**'
|
||||
- 'tests/e2e/**'
|
||||
- 'CMakeLists.txt'
|
||||
- 'setup.py'
|
||||
- 'requirements.txt'
|
||||
- 'requirements-dev.txt'
|
||||
- 'requirements-lint.txt'
|
||||
- 'packages.txt'
|
||||
ut_tracker:
|
||||
- 'tests/ut/**'
|
||||
ut:
|
||||
needs: [lint, changes]
|
||||
name: unit test
|
||||
# only trigger unit test after lint passed and the change is e2e and ut related. Or the PR is merged.
|
||||
if: ${{ github.event_name == 'push' || (needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true')) }}
|
||||
# only trigger unit test after lint passed and the change is e2e and ut related.
|
||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
|
||||
|
@ -112,9 +111,8 @@ jobs:
|
|||
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
|
||||
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
|
||||
|
||||
- name: Run unit test for V1 Engine
|
||||
- name: Run unit test
|
||||
env:
|
||||
VLLM_USE_V1: 1
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
TORCH_DEVICE_BACKEND_AUTOLOAD: 0
|
||||
run: |
|
||||
|
@ -133,8 +131,8 @@ jobs:
|
|||
|
||||
e2e:
|
||||
needs: [lint, changes]
|
||||
# only trigger e2e test after lint passed and the change is e2e related.
|
||||
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
|
||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||
if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' }}
|
||||
strategy:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
|
@ -156,11 +154,11 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
|
||||
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
|
||||
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
|
||||
apt-get update -y
|
||||
apt install git -y
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
|
||||
- name: Checkout vllm-project/vllm-ascend repo
|
||||
uses: actions/checkout@v4
|
||||
|
@ -189,9 +187,8 @@ jobs:
|
|||
pip install -r requirements-dev.txt
|
||||
pip install -v -e .
|
||||
|
||||
- name: Run e2e test for V1 Engine
|
||||
- name: Run e2e test
|
||||
env:
|
||||
VLLM_USE_V1: 1
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
run: |
|
||||
|
@ -213,26 +210,6 @@ jobs:
|
|||
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
|
||||
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
|
||||
|
||||
- name: Run e2e test on V0 engine
|
||||
if: ${{ github.event_name == 'schedule' }}
|
||||
env:
|
||||
VLLM_USE_V1: 0
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
run: |
|
||||
pytest -sv tests/e2e/singlecard/test_offline_inference.py
|
||||
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
|
||||
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||
pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
|
||||
pytest -sv tests/e2e/singlecard/test_embedding.py
|
||||
pytest -sv tests/e2e/singlecard/ \
|
||||
--ignore=tests/e2e/singlecard/test_offline_inference.py \
|
||||
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
|
||||
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
|
||||
--ignore=tests/e2e/singlecard/test_camem.py \
|
||||
--ignore=tests/e2e/singlecard/test_prompt_embedding.py \
|
||||
--ignore=tests/e2e/singlecard/test_embedding.py
|
||||
|
||||
e2e-4-cards:
|
||||
needs: [e2e]
|
||||
if: ${{ needs.e2e.result == 'success' }}
|
||||
|
@ -257,11 +234,11 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
|
||||
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
|
||||
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
|
||||
apt-get update -y
|
||||
apt install git -y
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
|
||||
- name: Checkout vllm-project/vllm-ascend repo
|
||||
uses: actions/checkout@v4
|
||||
|
@ -290,9 +267,8 @@ jobs:
|
|||
pip install -r requirements-dev.txt
|
||||
pip install -v -e .
|
||||
|
||||
- name: Run vllm-project/vllm-ascend test for V1 Engine
|
||||
- name: Run vllm-project/vllm-ascend test
|
||||
env:
|
||||
VLLM_USE_V1: 1
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
run: |
|
||||
|
@ -308,19 +284,3 @@ jobs:
|
|||
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
|
||||
--ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
|
||||
--ignore=tests/e2e/multicard/test_data_parallel.py
|
||||
|
||||
- name: Run vllm-project/vllm-ascend test on V0 engine
|
||||
if: ${{ github.event_name == 'schedule' }}
|
||||
env:
|
||||
VLLM_USE_V1: 0
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
run: |
|
||||
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
|
||||
# Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
|
||||
# To avoid oom, we need to run the test in a single process.
|
||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
|
||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
|
||||
pytest -sv tests/e2e/multicard/test_data_parallel.py
|
||||
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
|
||||
--ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
|
||||
--ignore=tests/e2e/multicard/test_data_parallel.py
|
||||
|
|
|
@ -60,11 +60,11 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
|
||||
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
|
||||
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
|
||||
apt-get update -y
|
||||
apt install git -y
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
|
||||
- name: Checkout vllm-project/vllm-ascend repo
|
||||
uses: actions/checkout@v4
|
||||
|
|
|
@ -73,6 +73,7 @@ jobs:
|
|||
|
||||
- name: Config mirrors
|
||||
run: |
|
||||
# keep using tuna's proxy since linux-arm64-npu-static-8 is in another region
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
apt-get update -y
|
||||
|
|
|
@ -198,3 +198,7 @@ kernel_meta/
|
|||
/vllm_ascend/_version.py
|
||||
# build info file generated by setup.py
|
||||
/vllm_ascend/_build_info.py
|
||||
/vllm_ascend/include/
|
||||
|
||||
# generated by CANN
|
||||
fusion_result.json
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# Contributing to vLLM Ascend
|
||||
|
||||
You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including step-by-step guide to help you setup development environment, contribute first PR and test locally.
|
|
@ -0,0 +1,60 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.1.rc1-a3-ubuntu22.04-py3.10
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
rm -rf /var/cache/apt/* && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope ray && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
|
@ -0,0 +1,57 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.1.rc1-a3-openeuler22.03-py3.10
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope ray && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
11
README.md
11
README.md
|
@ -19,6 +19,10 @@ vLLM Ascend Plugin
|
|||
|
||||
---
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl//TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
|
||||
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
|
||||
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
|
||||
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
|
||||
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
|
||||
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
|
||||
|
@ -43,7 +47,12 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
|
|||
|
||||
## Getting Started
|
||||
|
||||
Please refer to [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details.
|
||||
Please use the following recommended versions to get started quickly:
|
||||
|
||||
| Version | Release type | Doc |
|
||||
|------------|--------------|--------------------------------------|
|
||||
|v0.9.2rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|
||||
|v0.7.3.post1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/stable/installation.html) for more details|
|
||||
|
||||
## Contributing
|
||||
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
|
||||
|
|
10
README.zh.md
10
README.zh.md
|
@ -20,6 +20,9 @@ vLLM Ascend Plugin
|
|||
---
|
||||
*最新消息* 🔥
|
||||
|
||||
- [2025/06] [用户案例](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html)现已上线!展示了LLaMA-Factory/verl/TRL/GPUStack等用户案例,展示了vLLM Ascend如何帮助昇腾用户在模型微调、评估、强化学习 (RL) 以及部署等场景中提升体验。
|
||||
- [2025/06] [贡献者](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)页面现已上线!所有的贡献都值得被记录,感谢所有的贡献者。
|
||||
- [2025/05] 我们发布了首个正式版本 [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)!我们与 vLLM 社区合作发布了一篇博客文章,分享了我们的实践:[Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html)。
|
||||
- [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料.
|
||||
- [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库,让vLLM可以无缝运行在Ascend NPU。
|
||||
- [2024/12] 我们正在与 vLLM 社区合作,以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
|
||||
|
@ -44,7 +47,12 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
|
|||
|
||||
## 开始使用
|
||||
|
||||
请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多.
|
||||
推荐您使用以下版本快速开始使用:
|
||||
|
||||
| Version | Release type | Doc |
|
||||
|------------|--------------|--------------------------------------|
|
||||
|v0.9.2rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||
|v0.7.3.post1| 最新正式/稳定版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/stable/installation.html)了解更多|
|
||||
|
||||
## 贡献
|
||||
请参考 [CONTRIBUTING]((https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html)) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
pandas
|
||||
datasets
|
||||
modelscope
|
||||
libcst
|
||||
tabulate
|
|
@ -1,79 +0,0 @@
|
|||
import os
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import libcst as cst
|
||||
import libcst.matchers as m
|
||||
|
||||
# Patch the benchmark_dataset.py file to set streaming=False in load_dataset calls
|
||||
|
||||
|
||||
# TODO(Potabk): Remove this patch when the issue is fixed in the upstream
|
||||
class StreamingFalseTransformer(cst.CSTTransformer):
|
||||
def __init__(self):
|
||||
self.in_target_class = False
|
||||
self.in_target_func = False
|
||||
|
||||
def visit_ClassDef(self, node):
|
||||
if node.name.value == "HuggingFaceDataset":
|
||||
self.in_target_class = True
|
||||
|
||||
def leave_ClassDef(self, original_node, updated_node):
|
||||
self.in_target_class = False
|
||||
return updated_node
|
||||
|
||||
def visit_FunctionDef(self, node):
|
||||
if self.in_target_class and node.name.value == "load_data":
|
||||
self.in_target_func = True
|
||||
|
||||
def leave_FunctionDef(self, original_node, updated_node):
|
||||
self.in_target_func = False
|
||||
return updated_node
|
||||
|
||||
def leave_Call(self, original_node, updated_node):
|
||||
if self.in_target_class and self.in_target_func:
|
||||
if m.matches(updated_node.func, m.Name("load_dataset")):
|
||||
new_args = []
|
||||
for arg in updated_node.args:
|
||||
if arg.keyword and arg.keyword.value == "streaming":
|
||||
new_arg = arg.with_changes(value=cst.Name("False"))
|
||||
new_args.append(new_arg)
|
||||
else:
|
||||
new_args.append(arg)
|
||||
return updated_node.with_changes(args=new_args)
|
||||
return updated_node
|
||||
|
||||
|
||||
def patch_file(path):
|
||||
abs_path = os.path.abspath(path)
|
||||
|
||||
if not os.path.exists(abs_path):
|
||||
print(f"File not found: {abs_path}")
|
||||
return
|
||||
|
||||
with open(abs_path, "r", encoding="utf-8") as f:
|
||||
source = f.read()
|
||||
|
||||
module = cst.parse_module(source)
|
||||
modified = module.visit(StreamingFalseTransformer())
|
||||
|
||||
with open(abs_path, "w", encoding="utf-8") as f:
|
||||
f.write(modified.code)
|
||||
|
||||
print(f"Patched: {abs_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = ArgumentParser(
|
||||
description="Patch benchmark_dataset.py to set streaming=False in load_dataset calls"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--path", type=str, help="Path to the benchmark_dataset.py file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--path",
|
||||
type=str,
|
||||
default="/vllm-workspace/vllm/vllm/benchmarks/datasets.py",
|
||||
help="Path to the benchmark_dataset.py file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
patch_file(args.path)
|
|
@ -281,7 +281,6 @@ cleanup_on_error() {
|
|||
main() {
|
||||
START_TIME=$(date +%s)
|
||||
check_npus
|
||||
python3 benchmarks/scripts/patch_benchmark_dataset.py
|
||||
|
||||
# dependencies
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
|
|
@ -1,241 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) China Merchants Bank Co., Ltd. 2025. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "kernel_operator.h"
|
||||
constexpr int32_t BUFFER_NUM = 1;
|
||||
class KernelAdvanceStep{
|
||||
public:
|
||||
__aicore__ inline KernelAdvanceStep() {}
|
||||
__aicore__ inline void Init(int32_t tasks_per_core,
|
||||
int32_t num_queries,
|
||||
__gm__ int64_t* input_tokens_ptr,
|
||||
__gm__ int64_t* sampled_token_ids_ptr,
|
||||
__gm__ int64_t* input_positions_ptr,
|
||||
__gm__ int32_t* seq_lens_ptr,
|
||||
__gm__ int32_t* slot_mapping_ptr)
|
||||
{
|
||||
this->tasks_per_core = tasks_per_core;
|
||||
|
||||
this->start_id = this->tasks_per_core * AscendC::GetBlockIdx();
|
||||
this->end_id = this->tasks_per_core * (AscendC::GetBlockIdx() + 1) - 1;
|
||||
|
||||
// actual task nums of each core
|
||||
this->actual_task_per_core = tasks_per_core;
|
||||
if(this->end_id >= num_queries) {
|
||||
this->actual_task_per_core = num_queries - this->start_id;
|
||||
this->end_id = num_queries - 1;
|
||||
}
|
||||
|
||||
int32_t offset_this_core = this->tasks_per_core * AscendC::GetBlockIdx();
|
||||
|
||||
// init outQues
|
||||
pipe.InitBuffer(outQueInputTokens, BUFFER_NUM, this->actual_task_per_core * sizeof(int64_t));
|
||||
pipe.InitBuffer(outQueInputPos, BUFFER_NUM, this->actual_task_per_core * sizeof(int64_t));
|
||||
pipe.InitBuffer(outQueSeqLen, BUFFER_NUM, this->actual_task_per_core * sizeof(int32_t));
|
||||
pipe.InitBuffer(outQueSlotMapping, BUFFER_NUM, this->actual_task_per_core * sizeof(int32_t));
|
||||
|
||||
// init inQues
|
||||
pipe.InitBuffer(inQueSeqLen,BUFFER_NUM, this->actual_task_per_core * sizeof(int32_t));
|
||||
pipe.InitBuffer(inQueSampledTokenIds,BUFFER_NUM, this->actual_task_per_core * sizeof(int64_t));
|
||||
|
||||
// init GlobalMemory
|
||||
inputTokensGm.SetGlobalBuffer((__gm__ int64_t *)input_tokens_ptr + offset_this_core, this->actual_task_per_core);
|
||||
sampledTokenIdsGm.SetGlobalBuffer((__gm__ int64_t *)sampled_token_ids_ptr + offset_this_core, this->actual_task_per_core);
|
||||
inputPositionsGm.SetGlobalBuffer((__gm__ int64_t *)input_positions_ptr + offset_this_core, this->actual_task_per_core);
|
||||
seqLensGm.SetGlobalBuffer((__gm__ int32_t *)seq_lens_ptr + offset_this_core, this->actual_task_per_core);
|
||||
slotMappingGm.SetGlobalBuffer((__gm__ int32_t *)slot_mapping_ptr + offset_this_core, this->actual_task_per_core);
|
||||
}
|
||||
__aicore__ inline void Process(int64_t block_size, __gm__ int32_t* block_tables_ptr, int64_t block_tables_stride)
|
||||
{
|
||||
// no need for tilling or pipeline parallel within each core, as the amount of data processed is very small
|
||||
CopyIn();
|
||||
Update(block_size, block_tables_ptr, block_tables_stride);
|
||||
CopyOut();
|
||||
}
|
||||
|
||||
private:
|
||||
__aicore__ inline void CopyIn()
|
||||
{
|
||||
AscendC::LocalTensor<int32_t> seqLenLocalIn = inQueSeqLen.AllocTensor<int32_t>();
|
||||
AscendC::LocalTensor<int64_t> sampledTokenIdsLocal = inQueSampledTokenIds.AllocTensor<int64_t>();
|
||||
|
||||
AscendC::DataCopyExtParams copyParams32{1, static_cast<uint32_t>(this->actual_task_per_core * sizeof(int32_t)), 0, 0, 0}; // blockLen = tasks_per_core * 32 / 8 个字节(int32为4字节)
|
||||
AscendC::DataCopyExtParams copyParams64{1, static_cast<uint32_t>(this->actual_task_per_core * sizeof(int64_t)), 0, 0, 0}; // blockLen = tasks_per_core * 64 / 8 个字节(int64为8字节)
|
||||
|
||||
// calculate the nums that need padded
|
||||
// so that the total length becomes a multiple of 32 bytes which is a requirement of DataCopy Function.
|
||||
uint8_t remainNum32 =this->actual_task_per_core * sizeof(int32_t) % 32;
|
||||
uint8_t needPadElements32 = remainNum32 == 0 ? remainNum32 : (32 - remainNum32) / sizeof(int32_t);
|
||||
|
||||
AscendC::DataCopyPadExtParams<int32_t> padParams32{true, 0, needPadElements32, 0};
|
||||
|
||||
// calculate the nums that need padded
|
||||
// so that the total length becomes a multiple of 32 bytes which is a requirement of DataCopy Function.
|
||||
uint8_t remainNum64 =this->actual_task_per_core * sizeof(int64_t) % 32;
|
||||
uint8_t needPadElements64 = remainNum64 == 0 ? remainNum64 : (32 - remainNum64) / sizeof(int64_t);
|
||||
AscendC::DataCopyPadExtParams<int64_t> padParams64{true, 0, needPadElements64, 0};
|
||||
|
||||
AscendC::DataCopyPad(seqLenLocalIn, seqLensGm, copyParams32, padParams32);
|
||||
AscendC::DataCopyPad(sampledTokenIdsLocal, sampledTokenIdsGm, copyParams64, padParams64);
|
||||
|
||||
inQueSeqLen.EnQue(seqLenLocalIn);
|
||||
inQueSampledTokenIds.EnQue(sampledTokenIdsLocal);
|
||||
}
|
||||
__aicore__ inline void Update(int64_t block_size, __gm__ int32_t* block_tables_ptr, int64_t block_tables_stride)
|
||||
{
|
||||
// input
|
||||
AscendC::LocalTensor<int32_t> seqLenLocalIn = inQueSeqLen.DeQue<int32_t>();
|
||||
AscendC::LocalTensor<int64_t> sampledTokenIdsLocal = inQueSampledTokenIds.DeQue<int64_t>();
|
||||
|
||||
// output
|
||||
AscendC::LocalTensor<int64_t> inputTokensLocal = outQueInputTokens.AllocTensor<int64_t>();
|
||||
AscendC::LocalTensor<int64_t> inputPosLocal = outQueInputPos.AllocTensor<int64_t>();
|
||||
AscendC::LocalTensor<int32_t> seqLenLocalOut = outQueSeqLen.AllocTensor<int32_t>();
|
||||
AscendC::LocalTensor<int32_t> slotMappingLocal = outQueSlotMapping.AllocTensor<int32_t>();
|
||||
|
||||
auto unary_params = AscendC::UnaryRepeatParams(1, 1, 8, 8);
|
||||
|
||||
//Use "for" instead of AscendC::Adds function because AscendC::Adds does not work
|
||||
//when srcLocalMemory has different datatype from dstLocalMemory
|
||||
for(int i=0; i < this->actual_task_per_core; i++) {
|
||||
inputTokensLocal.SetValue(i, sampledTokenIdsLocal.GetValue(i));
|
||||
inputPosLocal.SetValue(i, seqLenLocalIn.GetValue(i));
|
||||
}
|
||||
|
||||
AscendC::Adds<int32_t, false>(seqLenLocalOut, seqLenLocalIn, 1, (uint64_t)0, 1, unary_params);
|
||||
|
||||
// Gather blockTables with dim=1, block_index. No Ascend Function available, use "for" instead.
|
||||
for(int cur_query_id = this->start_id, i = 0; i < this->actual_task_per_core; cur_query_id++, i++) {
|
||||
__gm__ int32_t const* seq_block_tables_ptr = block_tables_ptr + block_tables_stride * cur_query_id;
|
||||
|
||||
int block_index = inputPosLocal.GetValue(i) / block_size;
|
||||
int block_offset = inputPosLocal.GetValue(i) % block_size;
|
||||
|
||||
int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset;
|
||||
// Update slot_mapping
|
||||
slotMappingLocal.SetValue(i,slot_num);
|
||||
}
|
||||
|
||||
outQueInputTokens.EnQue(inputTokensLocal);
|
||||
outQueInputPos.EnQue(inputPosLocal);
|
||||
outQueSeqLen.EnQue(seqLenLocalOut);
|
||||
outQueSlotMapping.EnQue(slotMappingLocal);
|
||||
|
||||
inQueSampledTokenIds.FreeTensor(sampledTokenIdsLocal);
|
||||
inQueSeqLen.FreeTensor(seqLenLocalIn);
|
||||
|
||||
}
|
||||
__aicore__ inline void CopyOut()
|
||||
{
|
||||
AscendC::DataCopyExtParams copyParams32{1, static_cast<uint32_t>(this->actual_task_per_core * sizeof(int32_t)),0,0,0};
|
||||
AscendC::DataCopyExtParams copyParams64{1, static_cast<uint32_t>(this->actual_task_per_core * sizeof(int64_t)),0,0,0};
|
||||
|
||||
AscendC::LocalTensor<int64_t> inputTokensLocal = outQueInputTokens.DeQue<int64_t>();
|
||||
AscendC::DataCopyPad(inputTokensGm, inputTokensLocal, copyParams64);
|
||||
outQueInputTokens.FreeTensor(inputTokensLocal);
|
||||
|
||||
AscendC::LocalTensor<int64_t> inputPosLocal = outQueInputPos.DeQue<int64_t>();
|
||||
AscendC::DataCopyPad(inputPositionsGm, inputPosLocal, copyParams64);
|
||||
outQueInputPos.FreeTensor(inputPosLocal);
|
||||
|
||||
AscendC::LocalTensor<int32_t> seqLenLocalOut = outQueSeqLen.DeQue<int32_t>();
|
||||
AscendC::DataCopyPad(seqLensGm, seqLenLocalOut, copyParams32);
|
||||
outQueSeqLen.FreeTensor(seqLenLocalOut);
|
||||
|
||||
AscendC::LocalTensor<int32_t> slotMappingLocal = outQueSlotMapping.DeQue<int32_t>();
|
||||
AscendC::DataCopyPad(slotMappingGm, slotMappingLocal, copyParams32);
|
||||
outQueSlotMapping.FreeTensor(slotMappingLocal);
|
||||
}
|
||||
|
||||
private:
|
||||
AscendC::TPipe pipe;
|
||||
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueInputTokens, outQueInputPos,
|
||||
outQueSeqLen, outQueSlotMapping;
|
||||
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueSeqLen,
|
||||
inQueSampledTokenIds,
|
||||
inQueBlockTables;
|
||||
|
||||
AscendC::GlobalTensor<int64_t> inputTokensGm, sampledTokenIdsGm, inputPositionsGm ;
|
||||
|
||||
AscendC::GlobalTensor<int32_t> seqLensGm, slotMappingGm, blockTablesGm;
|
||||
|
||||
int32_t tasks_per_core, start_id, end_id, actual_task_per_core;
|
||||
};
|
||||
|
||||
extern "C" __global__ __aicore__ void AdvanceStepFlashAttnKernel(
|
||||
int64_t num_seqs,
|
||||
int64_t num_queries,
|
||||
int64_t block_size,
|
||||
__gm__ int64_t* input_tokens_ptr,
|
||||
__gm__ int64_t* sampled_token_ids_ptr,
|
||||
__gm__ int64_t* input_positions_ptr,
|
||||
__gm__ int32_t* seq_lens_ptr,
|
||||
__gm__ int32_t* slot_mapping_ptr,
|
||||
__gm__ int32_t* block_tables_ptr,
|
||||
int64_t block_tables_stride,
|
||||
int32_t tasks_per_core
|
||||
)
|
||||
{
|
||||
int start_id = tasks_per_core * AscendC::GetBlockIdx();
|
||||
// no task for this core.
|
||||
if(start_id >= num_queries) {
|
||||
return;
|
||||
}
|
||||
KernelAdvanceStep advanceStep;
|
||||
advanceStep.Init(tasks_per_core, num_queries, input_tokens_ptr, sampled_token_ids_ptr, input_positions_ptr, seq_lens_ptr, slot_mapping_ptr);
|
||||
advanceStep.Process(block_size,block_tables_ptr,block_tables_stride);
|
||||
}
|
||||
|
||||
namespace vllm_ascend
|
||||
{
|
||||
|
||||
extern void launch_advance_step_flashattn(
|
||||
void* stream,
|
||||
int64_t num_seqs,
|
||||
int64_t num_queries,
|
||||
int64_t block_size,
|
||||
int64_t* input_tokens_ptr,
|
||||
int64_t* sampled_token_ids_ptr,
|
||||
int64_t* input_positions_ptr,
|
||||
int32_t* seq_lens_ptr,
|
||||
int32_t* slot_mapping_ptr,
|
||||
int32_t* block_tables_ptr,
|
||||
int64_t block_tables_stride)
|
||||
{
|
||||
int32_t num_cores = 20;
|
||||
|
||||
if(num_cores > num_queries) {
|
||||
num_cores = num_queries;
|
||||
}
|
||||
|
||||
// task num processed of each core
|
||||
int32_t tasks_per_core = (num_queries + num_cores - 1) / num_cores;
|
||||
|
||||
AdvanceStepFlashAttnKernel<<<num_cores, nullptr, stream>>>(
|
||||
num_seqs,
|
||||
num_queries,
|
||||
block_size,
|
||||
input_tokens_ptr,
|
||||
sampled_token_ids_ptr,
|
||||
input_positions_ptr,
|
||||
seq_lens_ptr,
|
||||
slot_mapping_ptr,
|
||||
block_tables_ptr,
|
||||
block_tables_stride,
|
||||
tasks_per_core);
|
||||
}
|
||||
|
||||
}
|
12
csrc/ops.h
12
csrc/ops.h
|
@ -60,16 +60,4 @@ namespace vllm_ascend {
|
|||
auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options);
|
||||
return new_tensor;
|
||||
}
|
||||
extern void launch_advance_step_flashattn(
|
||||
void* stream,
|
||||
int64_t num_seqs,
|
||||
int64_t num_queries,
|
||||
int64_t block_size,
|
||||
int64_t* input_tokens_ptr,
|
||||
int64_t* sampled_token_ids_ptr,
|
||||
int64_t* input_positions_ptr,
|
||||
int32_t* seq_lens_ptr,
|
||||
int32_t* slot_mapping_ptr,
|
||||
int32_t* block_tables_ptr,
|
||||
int64_t block_tables_stride);
|
||||
}
|
||||
|
|
|
@ -204,87 +204,6 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
|
|||
cmd.Run();
|
||||
return {masked_input, mask};
|
||||
}
|
||||
|
||||
void verify_tensor(std::string const& name, at::Tensor const& t,
|
||||
int64_t const size_0, int64_t const size_1,
|
||||
c10::ScalarType const type) {
|
||||
bool size_0_cond = true;
|
||||
if (size_0 != -1) {
|
||||
size_0_cond = t.size(0) == size_0;
|
||||
}
|
||||
|
||||
bool size_1_cond = true;
|
||||
if (size_1 != -1) {
|
||||
size_1_cond = t.size(1) == size_1;
|
||||
}
|
||||
|
||||
bool is_contiguous = t.is_contiguous();
|
||||
bool same_type = t.dtype() == type;
|
||||
|
||||
bool pass = size_0_cond && size_1_cond && is_contiguous && same_type;
|
||||
if (!pass) {
|
||||
TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(),
|
||||
" is_cont = ", t.is_contiguous(), ", type = ", t.dtype(),
|
||||
" is not as expected: shape = [", size_0, ", ", size_1,
|
||||
"], type = ", type);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void advance_step_flashattn_ascendc(
|
||||
int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||
at::Tensor& input_tokens,
|
||||
at::Tensor& sampled_token_ids,
|
||||
at::Tensor& input_positions,
|
||||
at::Tensor& seq_lens,
|
||||
at::Tensor& slot_mapping,
|
||||
at::Tensor& block_tables
|
||||
){
|
||||
// Verify all tensors
|
||||
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
||||
verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,at::kLong);
|
||||
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
|
||||
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
|
||||
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kInt);
|
||||
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
|
||||
|
||||
|
||||
int64_t* input_tokens_ptr = input_tokens.data_ptr<int64_t>();
|
||||
int64_t* sampled_token_ids_ptr = sampled_token_ids.data_ptr<int64_t>();
|
||||
int64_t* input_positions_ptr = input_positions.data_ptr<int64_t>();
|
||||
int32_t* seq_lens_ptr = seq_lens.data_ptr<int32_t>();
|
||||
int32_t* slot_mapping_ptr = slot_mapping.data_ptr<int32_t>();
|
||||
int32_t* block_tables_ptr = block_tables.data_ptr<int32_t>();
|
||||
|
||||
|
||||
int32_t device_id;
|
||||
aclrtGetDevice(&device_id);
|
||||
auto npu_stream = c10_npu::getCurrentNPUStream(device_id);
|
||||
aclrtStream stream = npu_stream.stream();
|
||||
|
||||
// aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
|
||||
at_npu::native::OpCommand cmd;
|
||||
cmd.Name("advance_step_flashattn_ascendc");
|
||||
cmd.SetCustomHandler([stream, num_seqs, num_queries,
|
||||
block_size, input_tokens_ptr, sampled_token_ids_ptr,
|
||||
input_positions_ptr, seq_lens_ptr, slot_mapping_ptr,
|
||||
block_tables_ptr, block_tables]() -> int {
|
||||
launch_advance_step_flashattn(stream,
|
||||
num_seqs,
|
||||
num_queries,
|
||||
block_size,
|
||||
input_tokens_ptr,
|
||||
sampled_token_ids_ptr,
|
||||
input_positions_ptr,
|
||||
seq_lens_ptr,
|
||||
slot_mapping_ptr,
|
||||
block_tables_ptr,
|
||||
block_tables.stride(0));
|
||||
return 0;
|
||||
});
|
||||
cmd.Run();
|
||||
return ;
|
||||
}
|
||||
} // namespace vllm_ascend
|
||||
|
||||
TORCH_LIBRARY_EXPAND(_C, ops)
|
||||
|
@ -309,12 +228,6 @@ TORCH_LIBRARY_EXPAND(_C, ops)
|
|||
" int added_vocab_start_index, "
|
||||
" int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)");
|
||||
ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask);
|
||||
|
||||
ops.def(
|
||||
"advance_step_flashattn_ascendc(int num_seqs, int num_queries, int block_size,"
|
||||
" Tensor! input_tokens, Tensor! sampled_token_ids, Tensor! input_positions,"
|
||||
" Tensor! seq_lens, Tensor! slot_mapping, Tensor! block_tables) -> ()");
|
||||
ops.impl("advance_step_flashattn_ascendc", torch::kPrivateUse1, &vllm_ascend::advance_step_flashattn_ascendc);
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(_C)
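As a usage sketch, the registered op can be invoked from Python as below. The argument order follows the schema registered above, and the shapes/dtypes follow the `verify_tensor` checks; the dummy values and the way the `_C` extension gets loaded (importing `vllm_ascend`) are assumptions for illustration, not taken from this diff.

```python
import torch
import torch_npu  # noqa: F401  # provides the "npu" device
import vllm_ascend  # noqa: F401  # assumed to load the compiled _C extension

num_seqs = num_queries = 4
block_size = 128

# Shapes/dtypes mirror verify_tensor: int64 tokens/positions, int32 lens/slots/tables.
input_tokens = torch.zeros(num_seqs, dtype=torch.int64, device="npu")
sampled_token_ids = torch.zeros((num_queries, 1), dtype=torch.int64, device="npu")
input_positions = torch.zeros(num_seqs, dtype=torch.int64, device="npu")
seq_lens = torch.ones(num_seqs, dtype=torch.int32, device="npu")
slot_mapping = torch.zeros(num_seqs, dtype=torch.int32, device="npu")
block_tables = torch.zeros((num_seqs, 1), dtype=torch.int32, device="npu")

# Updates the tensors in place on the NPU stream set by the custom handler.
torch.ops._C.advance_step_flashattn_ascendc(
    num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
    input_positions, seq_lens, slot_mapping, block_tables)
```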
|
||||
|
|
|
@ -163,3 +163,7 @@ export HCCL_DETERMINISTIC = 1
|
|||
export ATB_MATMUL_SHUFFLE_K_ENABLE = 0
|
||||
export ATB_LLM_LCOC_ENABLE = 0
|
||||
```
|
||||
|
||||
### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model?
|
||||
The `Qwen2.5-Omni` model requires the `librosa` package. Install the `qwen-omni-utils` package (`pip install qwen-omni-utils`) to pull in all required dependencies: it installs `librosa` and its related packages, which resolves the `ImportError: No module named 'librosa'` error and enables audio processing.
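A quick, illustrative way to confirm the audio dependencies are in place after installation:

```python
# Illustrative check: if this import succeeds, the audio dependencies
# pulled in by qwen-omni-utils (including librosa) are available.
import librosa

print(librosa.__version__)
```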
|
||||
|
|
|
@ -56,9 +56,10 @@ hccn_tool -i 0 -ping -g address 10.20.0.20
|
|||
## Run with docker
|
||||
Assume you have two Atlas 800 A2 (64G*8) nodes and want to deploy the `deepseek-v3-w8a8` quantized model across them.
|
||||
|
||||
```shell
|
||||
# Define the image and container name
|
||||
export IMAGE=quay.io/ascend/vllm-ascend:main
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
# Update the vllm-ascend image
|
||||
export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||
export NAME=vllm-ascend
|
||||
|
||||
# Run the container using the defined variables
|
||||
|
@ -111,7 +112,7 @@ export OMP_NUM_THREADS=100
|
|||
export HCCL_BUFFSIZE=1024
|
||||
|
||||
# The w8a8 weights can be obtained from https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V3-W8A8
|
||||
# If you want to do the quantization manually, please refer to https://vllm-ascend.readthedocs.io/en/latest/user_guide/quantization.html
# If you want to do the quantization manually, please refer to https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/quantization.html
|
||||
vllm serve /root/.cache/ds_v3 \
|
||||
--host 0.0.0.0 \
|
||||
--port 8004 \
|
||||
|
|
|
@ -54,7 +54,11 @@ vllm serve /path/to/pangu-pro-moe-model \
|
|||
|
||||
Once your server is started, you can query the model with input prompts:
|
||||
|
||||
```bash
|
||||
:::::{tab-set}
|
||||
::::{tab-item} v1/completions
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
export question="你是谁?"
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
|
@ -66,6 +70,28 @@ curl http://localhost:8000/v1/completions \
|
|||
"temperature": 0.6
|
||||
}'
|
||||
```
|
||||
::::
|
||||
|
||||
::::{tab-item} v1/chat/completions
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
curl http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "system", "content": ""},
|
||||
{"role": "user", "content": "你是谁?"}
|
||||
],
|
||||
"max_tokens": "64",
|
||||
"top_p": "0.95",
|
||||
"top_k": "50",
|
||||
"temperature": "0.6",
|
||||
"add_special_tokens" : true
|
||||
}'
|
||||
```
|
||||
::::
|
||||
:::::
|
||||
|
||||
If you run this successfully, you can see the info shown below:
|
||||
|
||||
|
@ -77,15 +103,21 @@ If you run this successfully, you can see the info shown below:
|
|||
|
||||
Run the following script to execute offline inference on multi-NPU:
|
||||
|
||||
```python
|
||||
:::::{tab-set}
|
||||
::::{tab-item} Graph Mode
|
||||
|
||||
```{code-block} python
|
||||
:substitutions:
|
||||
import gc
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import (destroy_distributed_environment,
|
||||
destroy_model_parallel)
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
def clean_up():
|
||||
destroy_model_parallel()
|
||||
destroy_distributed_environment()
|
||||
|
@ -106,7 +138,72 @@ if __name__ == "__main__":
|
|||
{"role": "system", "content": ""}, # Optionally customize system content
|
||||
{"role": "user", "content": text}
|
||||
]
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) # using the official chat template is recommended
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
prompts.append(prompt)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)
|
||||
|
||||
llm = LLM(model="/path/to/pangu-pro-moe-model",
|
||||
tensor_parallel_size=4,
|
||||
distributed_executor_backend="mp",
|
||||
max_model_len=1024,
|
||||
trust_remote_code=True,
|
||||
additional_config={
|
||||
'torchair_graph_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
'ascend_scheduler_config':{
|
||||
'enabled': True,
|
||||
'enable_chunked_prefill' : False,
|
||||
'chunked_prefill_enabled': False
|
||||
},
|
||||
})
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
del llm
|
||||
clean_up()
|
||||
```
|
||||
::::
|
||||
|
||||
::::{tab-item} Eager Mode
|
||||
```{code-block} python
|
||||
:substitutions:
|
||||
import gc
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import (destroy_distributed_environment,
|
||||
destroy_model_parallel)
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
def clean_up():
|
||||
destroy_model_parallel()
|
||||
destroy_distributed_environment()
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("/path/to/pangu-pro-moe-model", trust_remote_code=True)
|
||||
tests = [
|
||||
"Hello, my name is",
|
||||
"The future of AI is",
|
||||
]
|
||||
prompts = []
|
||||
for text in tests:
|
||||
messages = [
|
||||
{"role": "system", "content": ""}, # Optionally customize system content
|
||||
{"role": "user", "content": text}
|
||||
]
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
prompts.append(prompt)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)
|
||||
|
@ -127,6 +224,8 @@ if __name__ == "__main__":
|
|||
del llm
|
||||
clean_up()
|
||||
```
|
||||
::::
|
||||
:::::
|
||||
|
||||
If you run this script successfully, you can see the info shown below:
|
||||
|
||||
|
|
|
@ -5,4 +5,4 @@ Like vLLM, vllm-ascend supports LoRA as well. The usage and more details can be
|
|||
You can also refer to [this](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) to find which models support LoRA in vLLM.
|
||||
|
||||
## Tips
|
||||
If you fail to run vllm-ascend with LoRA, you may follow [this instruction](https://vllm-ascend.readthedocs.io/en/latest/user_guide/graph_mode.html#fallback-to-eager-mode) to disable graph mode and try again.
|
||||
If you fail to run vllm-ascend with LoRA, you may follow [this instruction](https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html#fallback-to-eager-mode) to disable graph mode and try again.
|
||||
|
|
|
@ -8,7 +8,6 @@ This is the 1st release candidate of v0.9.2 for vLLM Ascend. Please follow the [
|
|||
- Pooling models work with the V1 engine now. You can give it a try with the Qwen3 embedding model [#1359](https://github.com/vllm-project/vllm-ascend/pull/1359).
|
||||
- The performance on Atlas 300I series has been improved. [#1591](https://github.com/vllm-project/vllm-ascend/pull/1591)
|
||||
- aclgraph mode works with Moe models now. Currently, only Qwen3 Moe is well tested. [#1381](https://github.com/vllm-project/vllm-ascend/pull/1381)
|
||||
- Pipeline parallelism works with V1 Engine now. [#1700](https://github.com/vllm-project/vllm-ascend/pull/1700)
|
||||
|
||||
### Core
|
||||
- Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250619`. Don’t forget to update it in your environment. [#1347](https://github.com/vllm-project/vllm-ascend/pull/1347)
|
||||
|
@ -28,9 +27,6 @@ This is the 1st release candidate of v0.9.2 for vLLM Ascend. Please follow the [
|
|||
- DeepSeek works with prefix cache now. [#1498](https://github.com/vllm-project/vllm-ascend/pull/1498)
|
||||
- Support prompt logprobs to recover ceval accuracy in V1 [#1483](https://github.com/vllm-project/vllm-ascend/pull/1483)
|
||||
|
||||
### Known Issues
|
||||
- Pipeline parallelism is not working on ray in this version. It'll be supported in the next release. [#1751](https://github.com/vllm-project/vllm-ascend/issues/1751)
|
||||
|
||||
## v0.9.1rc1 - 2025.06.22
|
||||
|
||||
This is the 1st release candidate of v0.9.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
|
||||
|
@ -81,7 +77,7 @@ This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [
|
|||
|
||||
### Highlights
|
||||
|
||||
- DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/graph_mode.html) to give it a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789)
- DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html) to give it a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789)
- Qwen series models work with graph mode now. It is enabled by default with the V1 engine. Please note that in this release, only Qwen series models are well tested with graph mode. We'll make it stable and generalize it in the next release. If you hit any issues, please feel free to open an issue on GitHub and temporarily fall back to eager mode by setting `enforce_eager=True` when initializing the model.
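For illustration, falling back to eager mode only requires passing `enforce_eager=True` when constructing the engine (the model name below is just an example):

```python
from vllm import LLM

# Fall back to eager mode temporarily; graph compilation is skipped entirely.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)
```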
|
||||
|
||||
### Core
|
||||
|
@ -162,7 +158,7 @@ We are excited to announce the release of 0.7.3 for vllm-ascend. This is the fir
|
|||
|
||||
## v0.8.5rc1 - 2025.05.06
|
||||
|
||||
This is the 1st release candidate of v0.8.5 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Now you can enable the V1 engine by setting the environment variable `VLLM_USE_V1=1`; see the feature support status of vLLM Ascend [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/suppoted_features.html).
This is the 1st release candidate of v0.8.5 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Now you can enable the V1 engine by setting the environment variable `VLLM_USE_V1=1`; see the feature support status of vLLM Ascend [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html).
|
||||
|
||||
### Highlights
|
||||
- Upgrade CANN version to 8.1.RC1 to support chunked prefill and automatic prefix caching (`--enable_prefix_caching`) when V1 is enabled [#747](https://github.com/vllm-project/vllm-ascend/pull/747)
|
||||
|
@ -205,7 +201,7 @@ This is the second release candidate of v0.8.4 for vllm-ascend. Please follow th
|
|||
|
||||
## v0.8.4rc1 - 2025.04.18
|
||||
|
||||
This is the first release candidate of v0.8.4 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this version, vllm-ascend will follow the newest version of vllm and release every two weeks. For example, if vllm releases v0.8.5 in the next two weeks, vllm-ascend will release v0.8.5rc1 instead of v0.8.4rc2. Please find the detail from the [official documentation](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/versioning_policy.html#release-window).
|
||||
This is the first release candidate of v0.8.4 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this version, vllm-ascend will follow the newest version of vllm and release every two weeks. For example, if vllm releases v0.8.5 in the next two weeks, vllm-ascend will release v0.8.5rc1 instead of v0.8.4rc2. Please find the detail from the [official documentation](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#release-window).
|
||||
|
||||
### Highlights
|
||||
|
||||
|
|
|
@ -12,6 +12,9 @@ import os
|
|||
import time
|
||||
from multiprocessing import Event, Process
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
kv_connector_extra_config = {
|
||||
"prefill_device_ips": ["1.2.3.1", "1.2.3.2"],
|
||||
"decode_device_ips": ["1.2.3.9", "1.2.3.10"],
|
||||
|
|
|
@ -13,6 +13,9 @@ import msgpack # type: ignore
|
|||
import zmq
|
||||
from quart import Quart, make_response, request
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
DP_PROXY_HTTP_PORT = 10004
|
||||
DP_PROXY_ZMQ_REG_PORT = 30006
|
||||
DP_PROXY_ZMQ_NOTIFY_PORT = 30005
|
||||
|
|
|
@ -8,6 +8,9 @@ import msgpack # type: ignore
|
|||
import zmq
|
||||
from quart import Quart, make_response, request
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
prefill_instances: dict[str, str] = {} # http_address: zmq_address
|
||||
decode_instances: dict[str, str] = {} # http_address: zmq_address
|
||||
|
||||
|
|
|
@ -8,6 +8,9 @@ import matplotlib.pyplot as plt # type: ignore
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
logger = logging.getLogger("msit_logger")
|
||||
|
||||
|
||||
|
|
|
@ -56,10 +56,18 @@ Multi-node:
|
|||
|
||||
import os
|
||||
from time import sleep
|
||||
import contextlib
|
||||
import gc
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel)
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
def parse_args():
|
||||
import argparse
|
||||
|
@ -107,6 +115,15 @@ def parse_args():
|
|||
return parser.parse_args()
|
||||
|
||||
|
||||
def cleanup_env_and_memory():
|
||||
destroy_model_parallel()
|
||||
destroy_distributed_environment()
|
||||
with contextlib.suppress(AssertionError):
|
||||
torch.distributed.destroy_process_group()
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
def main(
|
||||
model,
|
||||
dp_size,
|
||||
|
@ -120,7 +137,6 @@ def main(
|
|||
trust_remote_code,
|
||||
):
|
||||
# DP only support on V1 engine
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
|
||||
os.environ["VLLM_DP_SIZE"] = str(dp_size)
|
||||
|
@ -183,8 +199,9 @@ def main(
|
|||
f"Generated text: {generated_text!r}")
|
||||
|
||||
# Give engines time to pause their processing loops before exiting.
|
||||
sleep(1)
|
||||
|
||||
sleep(5)
|
||||
del llm
|
||||
cleanup_env_and_memory()
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
|
|
@ -21,6 +21,8 @@ import os
|
|||
import time
|
||||
from multiprocessing import Event, Process
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
def clean_up():
|
||||
import gc
|
||||
|
|
|
@ -17,28 +17,37 @@
|
|||
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
|
||||
#
|
||||
|
||||
import os
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-0.5B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="mp",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
def main():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# Generate texts from the prompts.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-0.5B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="mp",
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
# Generate texts from the prompts.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -3,9 +3,10 @@ import time
|
|||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
# enable dual-batch overlap for vllm ascend
|
||||
os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
|
||||
# Sample prompts.
|
||||
prompts = ["The president of the United States is"] * 41
|
||||
|
|
|
@ -19,35 +19,40 @@
|
|||
|
||||
import os
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
|
||||
import torch
|
||||
from vllm import LLM
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
def get_detailed_instruct(task_description: str, query: str) -> str:
|
||||
return f'Instruct: {task_description}\nQuery:{query}'
|
||||
|
||||
|
||||
# Each query must come with a one-sentence instruction that describes the task
|
||||
task = 'Given a web search query, retrieve relevant passages that answer the query'
|
||||
def main():
|
||||
# Each query must come with a one-sentence instruction that describes the task
|
||||
task = 'Given a web search query, retrieve relevant passages that answer the query'
|
||||
|
||||
queries = [
|
||||
get_detailed_instruct(task, 'What is the capital of China?'),
|
||||
get_detailed_instruct(task, 'Explain gravity')
|
||||
]
|
||||
# No need to add instruction for retrieval documents
|
||||
documents = [
|
||||
"The capital of China is Beijing.",
|
||||
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
|
||||
]
|
||||
input_texts = queries + documents
|
||||
queries = [
|
||||
get_detailed_instruct(task, 'What is the capital of China?'),
|
||||
get_detailed_instruct(task, 'Explain gravity')
|
||||
]
|
||||
# No need to add instruction for retrieval documents
|
||||
documents = [
|
||||
"The capital of China is Beijing.",
|
||||
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
|
||||
]
|
||||
input_texts = queries + documents
|
||||
|
||||
model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
|
||||
model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
|
||||
|
||||
outputs = model.embed(input_texts)
|
||||
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
|
||||
# Calculate the similarity scores between the first two queries and the last two documents
|
||||
scores = (embeddings[:2] @ embeddings[2:].T)
|
||||
print(scores.tolist())
|
||||
# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
|
||||
outputs = model.embed(input_texts)
|
||||
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
|
||||
# Calculate the similarity scores between the first two queries and the last two documents
|
||||
scores = (embeddings[:2] @ embeddings[2:].T)
|
||||
print(scores.tolist())
|
||||
# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -24,9 +24,14 @@ For most models, the prompt format should follow corresponding examples
|
|||
on HuggingFace model repository.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
|
||||
question_per_audio_count = {
|
||||
1: "What is recited in the audio?",
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
@ -13,32 +15,37 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
|
||||
#
|
||||
|
||||
# isort: skip_file
|
||||
import os
|
||||
|
||||
os.environ["VLLM_USE_V1"] = "0"
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
|
||||
def main():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
# Generate texts from the prompts.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
|
||||
|
||||
# Generate texts from the prompts.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -25,7 +25,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
|||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
def main():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
|
@ -48,3 +49,7 @@ if __name__ == "__main__":
|
|||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -22,11 +22,10 @@ import torch
|
|||
from vllm import LLM, SamplingParams
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
if __name__ == "__main__":
|
||||
def main():
|
||||
prompt = "How are you?"
|
||||
|
||||
free, total = torch.npu.mem_get_info()
|
||||
|
@ -52,3 +51,7 @@ if __name__ == "__main__":
|
|||
output2 = llm.generate(prompt, sampling_params)
|
||||
# cmp output
|
||||
assert output[0].outputs[0].text == output2[0].outputs[0].text
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 China Merchants Bank Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
"China is",
|
||||
]
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-0.5B",
|
||||
block_size=128,
|
||||
max_model_len=1024, # max length of prompt
|
||||
tensor_parallel_size=1, # number of NPUs to be used
|
||||
max_num_seqs=26, # max batch number
|
||||
enforce_eager=
|
||||
True, # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
|
||||
trust_remote_code=
|
||||
True, # If the model is a custom model not yet available in the HuggingFace transformers library
|
||||
num_scheduler_steps=8,
|
||||
gpu_memory_utilization=0.5)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
|
@ -1,8 +1,13 @@
|
|||
import os
|
||||
|
||||
import torch
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizer)
|
||||
from vllm import LLM
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
def init_tokenizer_and_llm(model_name: str):
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
export VLLM_USE_V1=1
|
||||
export TASK_QUEUE_ENABLE=1
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh
|
||||
|
|
|
@ -19,7 +19,5 @@ requires = [
|
|||
"msgpack",
|
||||
"quart",
|
||||
"numba",
|
||||
# Remove after https://github.com/vllm-project/vllm-ascend/issues/1470
|
||||
"transformers==4.52.4",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -12,4 +12,5 @@ xgrammar
|
|||
zmq
|
||||
types-psutil
|
||||
pytest-cov
|
||||
regex
|
||||
sentence_transformers
|
||||
|
|
|
@ -4,5 +4,6 @@ pre-commit==4.0.1
|
|||
# type checking
|
||||
mypy==1.11.1
|
||||
types-PyYAML
|
||||
types-regex
|
||||
types-requests
|
||||
types-setuptools
|
||||
|
|
|
@ -25,6 +25,3 @@ numba
|
|||
--pre
|
||||
--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
torch-npu==2.5.1.post1.dev20250619
|
||||
|
||||
# Remove after https://github.com/vllm-project/vllm-ascend/issues/1470
|
||||
transformers==4.52.4
|
||||
|
|
|
@ -39,14 +39,15 @@ from vllm.sampling_params import BeamSearchParams
|
|||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
|
||||
TokensTextLogprobsPromptLogprobs)
|
||||
from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
|
||||
TokensTextLogprobsPromptLogprobs)
|
||||
# TODO: remove this part after the patch is merged into vllm; if we do not
# explicitly patch here, some of the patches might be ineffective in the
# pytest scenario
|
||||
from vllm_ascend.utils import adapt_patch # noqa E402
|
||||
|
||||
adapt_patch(True)
|
||||
adapt_patch(False)
|
||||
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel)
|
||||
|
@ -61,7 +62,7 @@ PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
|
|||
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
|
||||
|
||||
_TEST_DIR = os.path.dirname(__file__)
|
||||
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")]
|
||||
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
|
||||
|
||||
|
||||
def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
|
|
@ -30,7 +30,6 @@ import pytest
|
|||
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="TODO: fix dp timeout error in ci")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
|
||||
|
|
|
@ -26,12 +26,11 @@ from unittest.mock import patch
|
|||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
@patch.dict(
|
||||
os.environ, {
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_WORKER_MULTIPROC_METHOD": "spawn",
|
||||
"TASK_QUEUE_ENABLE": "1",
|
||||
"VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
|
||||
|
@ -56,12 +55,10 @@ def test_generate_with_allgather():
|
|||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@patch.dict(
|
||||
os.environ, {
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_WORKER_MULTIPROC_METHOD": "spawn",
|
||||
"TASK_QUEUE_ENABLE": "1"
|
||||
})
|
||||
@patch.dict(os.environ, {
|
||||
"VLLM_WORKER_MULTIPROC_METHOD": "spawn",
|
||||
"TASK_QUEUE_ENABLE": "1"
|
||||
})
|
||||
def test_generate_with_alltoall():
|
||||
example_prompts = ["Hello, my name is"]
|
||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
||||
|
@ -79,4 +76,4 @@ def test_generate_with_alltoall():
|
|||
},
|
||||
"expert_tensor_parallel_size": 1
|
||||
}) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
|
||||
MODEL_PATH, do_sample)
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ from modelscope import snapshot_download # type: ignore
|
|||
from vllm import SamplingParams
|
||||
from vllm.model_executor.models.registry import ModelRegistry
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
#
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B",
|
||||
|
|
|
@ -2,12 +2,10 @@
|
|||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler."""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.model_utils import check_outputs_equal
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
# for MHA
|
||||
|
@ -60,8 +58,6 @@ INPUT_PROMPTS = [
|
|||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="mtp is not supported on v1")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [50])
|
||||
def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
||||
|
@ -89,8 +85,6 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="mtp is not supported on v1")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [50])
|
||||
def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
|
|
|
@ -22,9 +22,7 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`.
|
|||
import os
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
|
@ -78,8 +76,6 @@ def _deepseek_torchair_test_fixture(
|
|||
print(f"Generated text: {vllm_output[i][1]!r}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="torchair graph is not supported on v0")
|
||||
def test_e2e_deepseekv3_with_torchair():
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
|
@ -89,8 +85,6 @@ def test_e2e_deepseekv3_with_torchair():
|
|||
_deepseek_torchair_test_fixture(additional_config)
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="torchair graph is not supported on v0")
|
||||
def test_e2e_deepseekv3_with_torchair_ms_mla():
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
|
@ -150,8 +144,6 @@ def _pangu_torchair_test_fixture(
|
|||
print(f"Generated text: {vllm_output[i][1]!r}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="torchair graph is not supported on v0")
|
||||
def test_e2e_pangu_with_torchair():
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm import LLM
|
||||
|
||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||
pytest.skip("Test package requires V1", allow_module_level=True)
|
||||
|
||||
MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
|
||||
PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model():
|
||||
llm = LLM(
|
||||
MODEL,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
max_num_batched_tokens=200,
|
||||
max_num_seqs=3,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
}})
|
||||
yield llm
|
||||
del llm
|
||||
torch.npu.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
def test_concurrent_partial_prefill(model):
|
||||
outputs = model.generate([PROMPT] * 3)
|
||||
assert len(outputs) == 3
|
||||
for output in outputs:
|
||||
assert len(output.outputs) == 1
|
||||
|
||||
|
||||
def test_prefix_cache_stats_is_recorded(model):
|
||||
# 129 tokens will make sure the first 128 tokens are cached in a block
|
||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||
_ = model.generate([input_tokens])
|
||||
outputs = model.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 128
|
|
@ -1,60 +0,0 @@
|
|||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Compare the with and without chunked prefill on AscendScheduler
|
||||
|
||||
It tests chunked prefill. Chunked prefill can be enabled by
|
||||
`additional_config={'ascend_scheduler_config': {'enabled': True, 'enable_chunked_prefill': True,},}`.
|
||||
If prefill size exceeds max_num_batched_tokens, prefill requests are chunked.
|
||||
|
||||
Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B-Base",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
|
||||
def test_chunked_prefill_with_ascend_scheduler(
|
||||
example_prompts, model: str, max_tokens: int,
|
||||
chunked_prefill_token_size: int) -> None:
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_chunked_prefill': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
chunked_prefill_output = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=chunked_prefill_output,
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
|
@ -1,190 +0,0 @@
|
|||
# Copyright (c) China Merchants Bank Co., Ltd. 2025. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#/
|
||||
|
||||
# to run this test, you need to cd to the upper package which is 'tests',
|
||||
# and run with command 'pytest -s ops/test_multi_step.py'
|
||||
|
||||
import torch
|
||||
import torch_npu # noqa: F401
|
||||
|
||||
DTYPES = [torch.int32, torch.int64]
|
||||
DEVICES = [f"npu:{0}"]
|
||||
# Set tolerance to 0 for equals
|
||||
DEFAULT_ATOL = 0
|
||||
DEFAULT_RTOL = 0
|
||||
|
||||
# test custom ops of https://github.com/vllm-project/vllm-ascend/tree/main/csrc/kernels/advance_step.cpp
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def test_single_generation_multi_step() -> None:
|
||||
input_tokens_data = [2926]
|
||||
input_tokens_ascendc = torch.tensor(input_tokens_data, device='npu:0')
|
||||
input_tokens_python = torch.tensor(input_tokens_data, device='npu:0')
|
||||
|
||||
sampled_token_ids_data = [[13]]
|
||||
sampled_token_ids = torch.tensor(sampled_token_ids_data, device='npu:0')
|
||||
|
||||
input_positions_data = [5]
|
||||
input_positions_ascendc = torch.tensor(input_positions_data,
|
||||
device='npu:0')
|
||||
input_positions_python = torch.tensor(input_positions_data, device='npu:0')
|
||||
|
||||
seq_lens_data = [6]
|
||||
seq_lens_ascendc = torch.tensor(seq_lens_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
seq_lens_python = torch.tensor(seq_lens_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
slot_mapping_data = [5]
|
||||
slot_mapping_ascendc = torch.tensor(slot_mapping_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
slot_mapping_python = torch.tensor(slot_mapping_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
block_tables_data = [[0]]
|
||||
|
||||
block_tables = torch.tensor(block_tables_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
torch.ops._C.advance_step_flashattn_ascendc(
|
||||
1, 1, 128, input_tokens_ascendc, sampled_token_ids,
|
||||
input_positions_ascendc, seq_lens_ascendc, slot_mapping_ascendc,
|
||||
block_tables)
|
||||
|
||||
normal(1, 1, 128, input_tokens_python, sampled_token_ids,
|
||||
input_positions_python, seq_lens_python, slot_mapping_python,
|
||||
block_tables)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(input_tokens_ascendc,
|
||||
input_tokens_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(input_positions_ascendc,
|
||||
input_positions_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(seq_lens_ascendc,
|
||||
seq_lens_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(slot_mapping_ascendc,
|
||||
slot_mapping_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def test_multi_result_generation_multi_step() -> None:
|
||||
input_tokens_data = [2926, 279, 12095, 1588]
|
||||
input_tokens_ascendc = torch.tensor(input_tokens_data, device='npu:0')
|
||||
input_tokens_python = torch.tensor(input_tokens_data, device='npu:0')
|
||||
|
||||
sampled_token_ids_data = [[13], [1968], [13], [13]]
|
||||
sampled_token_ids = torch.tensor(sampled_token_ids_data, device='npu:0')
|
||||
|
||||
input_positions_data = [5, 7, 5, 5]
|
||||
input_positions_ascendc = torch.tensor(input_positions_data,
|
||||
device='npu:0')
|
||||
input_positions_python = torch.tensor(input_positions_data, device='npu:0')
|
||||
|
||||
seq_lens_data = [6, 8, 6, 6]
|
||||
seq_lens_ascendc = torch.tensor(seq_lens_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
seq_lens_python = torch.tensor(seq_lens_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
slot_mapping_data = [5, 135, 261, 389]
|
||||
slot_mapping_ascendc = torch.tensor(slot_mapping_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
slot_mapping_python = torch.tensor(slot_mapping_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
block_tables_data = [[0], [1], [2], [3]]
|
||||
|
||||
block_tables = torch.tensor(block_tables_data,
|
||||
device='npu:0',
|
||||
dtype=torch.int32)
|
||||
|
||||
torch.ops._C.advance_step_flashattn_ascendc(
|
||||
4, 4, 128, input_tokens_ascendc, sampled_token_ids,
|
||||
input_positions_ascendc, seq_lens_ascendc, slot_mapping_ascendc,
|
||||
block_tables)
|
||||
|
||||
normal(4, 4, 128, input_tokens_python, sampled_token_ids,
|
||||
input_positions_python, seq_lens_python, slot_mapping_python,
|
||||
block_tables)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(input_tokens_ascendc,
|
||||
input_tokens_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(input_positions_ascendc,
|
||||
input_positions_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(seq_lens_ascendc,
|
||||
seq_lens_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
torch.testing.assert_close(slot_mapping_ascendc,
|
||||
slot_mapping_python,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
|
||||
|
||||
def normal(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor, seq_lens_tensor: torch.Tensor,
|
||||
slot_mapping: torch.Tensor, block_tables: torch.Tensor) -> None:
|
||||
sampled_token_ids_list = sampled_token_ids[:num_queries].squeeze(-1)
|
||||
input_tokens[:num_queries] = sampled_token_ids_list
|
||||
|
||||
# get seq_lens and input_positions
|
||||
seq_lens = seq_lens_tensor[:num_queries]
|
||||
next_seq_lens = seq_lens + 1
|
||||
next_input_pos = next_seq_lens - 1
|
||||
|
||||
# update seq_lens and input_positions
|
||||
seq_lens_tensor[:num_queries] = next_seq_lens
|
||||
input_positions[:num_queries] = next_input_pos # type: ignore
|
||||
|
||||
# get block index and offset
|
||||
block_idx = next_input_pos // block_size
|
||||
block_offset = next_input_pos % block_size
|
||||
|
||||
current_block_table = block_tables.gather(
|
||||
1, block_idx.unsqueeze(-1)).squeeze(-1)
|
||||
slot_num = current_block_table * block_size + block_offset
|
||||
|
||||
# update slot_mapping
|
||||
slot_mapping[:num_queries] = slot_num
|
|
@ -53,7 +53,6 @@ def model_name():
|
|||
@pytest.mark.skipif(
|
||||
True, reason="TODO: Enable me after test_mtp_correctness is fixed")
|
||||
def test_mtp_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
|
@ -62,33 +61,30 @@ def test_mtp_correctness(
|
|||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using mtp speculative decoding.
|
||||
'''
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
spec_llm = LLM(model=model_name,
|
||||
trust_remote_code=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": 1,
|
||||
},
|
||||
max_model_len=256,
|
||||
enforce_eager=True)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
spec_llm = LLM(model=model_name,
|
||||
trust_remote_code=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": 1,
|
||||
},
|
||||
max_model_len=256,
|
||||
enforce_eager=True)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
|
|
|
@ -60,7 +60,6 @@ def eagle3_model_name():
|
|||
|
||||
|
||||
def test_ngram_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
|
@ -70,44 +69,40 @@ def test_ngram_correctness(
|
|||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
pytest.skip("Not current support for the test.")
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
# Heuristic: expect at least 70% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
del spec_llm
|
||||
# Heuristic: expect at least 70% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
del spec_llm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
def test_eagle_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
|
@ -119,43 +114,40 @@ def test_eagle_correctness(
|
|||
'''
|
||||
if not use_eagle3:
|
||||
pytest.skip("Not current support for the test.")
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
|
||||
spec_model_name = eagle3_model_name(
|
||||
) if use_eagle3 else eagle_model_name()
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
gpu_memory_utilization=0.6,
|
||||
speculative_config={
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 2,
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
enforce_eager=True,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
gpu_memory_utilization=0.6,
|
||||
speculative_config={
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 2,
|
||||
"max_model_len": 128,
|
||||
},
|
||||
max_model_len=128,
|
||||
enforce_eager=True,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
|
|
|
@ -20,14 +20,12 @@ Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""

import os

import pytest
import torch
from vllm import LLM, SamplingParams

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
@ -36,37 +34,29 @@ MODELS = [
|
|||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="aclgraph only support on v1")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_models(
|
||||
def test_models_with_aclgraph(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
with monkeypatch.context() as m:
|
||||
prompts = [
|
||||
"Hello, my name is", "The president of the United States is",
|
||||
"The capital of France is", "The future of AI is"
|
||||
]
|
||||
prompts = [
|
||||
"Hello, my name is", "The president of the United States is",
|
||||
"The capital of France is", "The future of AI is"
|
||||
]
|
||||
|
||||
# aclgraph is only supported on v1
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
# TODO: change to use vllmrunner when the registry of custom op is solved
|
||||
# while running pytest
|
||||
vllm_model = LLM(model, max_model_len=1024)
|
||||
vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||
temperature=0.0)
|
||||
# TODO: change to use vllmrunner when the registry of custom op is solved
|
||||
# while running pytest
|
||||
vllm_model = LLM(model)
|
||||
vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
|
||||
vllm_model = LLM(model, enforce_eager=True)
|
||||
vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
vllm_model = LLM(model, enforce_eager=True, max_model_len=1024)
|
||||
vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
|
||||
vllm_aclgraph_outputs_list = []
|
||||
for output in vllm_aclgraph_outputs:
|
||||
|
@ -86,12 +76,9 @@ def test_models(
|
|||
)
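The hunk above elides the loop that converts the RequestOutput objects into the (token_ids, text) tuples that check_outputs_equal compares. A hedged reconstruction of that step, assuming the usual tuple format used by these tests, looks like this:

vllm_aclgraph_outputs_list = [
    (list(output.outputs[0].token_ids), output.outputs[0].text)
    for output in vllm_aclgraph_outputs
]
vllm_eager_outputs_list = [
    (list(output.outputs[0].token_ids), output.outputs[0].text)
    for output in vllm_eager_outputs
]
check_outputs_equal(
    outputs_0_lst=vllm_eager_outputs_list,
    outputs_1_lst=vllm_aclgraph_outputs_list,
    name_0="vllm_eager_outputs",
    name_1="vllm_aclgraph_outputs",
)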
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
||||
reason="aclgraph only support on v1")
|
||||
def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_MODELSCOPE", "True")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
with pytest.raises(NotImplementedError) as excinfo:
|
||||
VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
max_model_len=1024,
|
||||
|
|
|
@ -15,13 +15,17 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is

EOS_TOKEN_ID = 50256
MODEL = "Qwen/Qwen3-0.6B"


def create_scheduler(
    model: str = "Qwen/Qwen2.5-0.5B-Instruct",
    model: str = MODEL,
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_prefix_caching: Optional[bool] = None,
@ -303,6 +307,8 @@ def test_stop_via_update_from_output():
|
|||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
if not vllm_version_is("0.9.2"):
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
|
@ -355,6 +361,8 @@ def test_stop_via_update_from_output():
|
|||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
if not vllm_version_is("0.9.2"):
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
|
@ -405,6 +413,8 @@ def test_stop_via_update_from_output():
|
|||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
if not vllm_version_is("0.9.2"):
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
|
@ -726,3 +736,83 @@ def test_memory_leak():
|
|||
|
||||
# Confirm no memory leak.
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
|
||||
def test_concurrent_partial_prefill():
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=200,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
|
||||
3)
|
||||
assert len(outputs) == 3
|
||||
for output in outputs:
|
||||
assert len(output.outputs) == 1
|
||||
|
||||
|
||||
def test_prefix_cache_stats_is_recorded():
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=3,
|
||||
max_num_batched_tokens=200,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
# 129 prompt tokens ensure the first 128 tokens fill complete blocks and get cached
|
||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
||||
_ = vllm_model.model.generate([input_tokens])
|
||||
outputs = vllm_model.model.generate([input_tokens])
|
||||
assert outputs[0].num_cached_tokens == 128
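The expected value follows from block-granular prefix caching: only full KV-cache blocks are reusable, and these tests use a 16-token block size elsewhere, so a 129-token prompt leaves 128 reusable tokens. A quick sanity check of that arithmetic (block size assumed to be 16):

block_size = 16          # assumed default KV-cache block size
prompt_len = 129
expected_cached = (prompt_len // block_size) * block_size
assert expected_cached == 128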
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens",
|
||||
[4]) # cannot align results when max_tokens > 4
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
|
||||
def test_chunked_prefill_with_ascend_scheduler(
|
||||
example_prompts, max_tokens: int,
|
||||
chunked_prefill_token_size: int) -> None:
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_chunked_prefill': True,
|
||||
},
|
||||
},
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
chunked_prefill_output = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(MODEL,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
},
|
||||
},
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
outputs_1_lst=chunked_prefill_output,
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
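With max_num_batched_tokens pinned to chunked_prefill_token_size, long prompts are prefilled across several scheduler steps. A rough, illustrative calculation (the prompt length is a placeholder; real lengths come from example_prompts):

import math

chunk_budget = 16        # chunked_prefill_token_size above
prompt_len = 64          # hypothetical prompt length
prefill_steps = math.ceil(prompt_len / chunk_budget)
assert prefill_steps == 4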
|
|
@ -21,7 +21,7 @@ import torch
from vllm import LLM, SamplingParams
from vllm.utils import GiB_bytes

from tests.utils import fork_new_process_for_each_test
from tests.e2e.utils import fork_new_process_for_each_test
from vllm_ascend.device_allocator.camem import CaMemAllocator

@ -20,8 +20,6 @@ Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""

import os

import pytest
import torch
from vllm import LLM, SamplingParams

@ -29,8 +27,6 @@ from vllm import LLM, SamplingParams
MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="new chunked prefill is only supported on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
@ -39,36 +35,33 @@ def test_models(
|
|||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
return
|
||||
with monkeypatch.context() as m:
|
||||
prompts = "The president of the United States is"
|
||||
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
prompts = "The president of the United States is"
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
vllm_model = LLM(model,
|
||||
long_prefill_token_threshold=4,
|
||||
enforce_eager=True)
|
||||
output_chunked = vllm_model.generate(prompts, sampling_params)
|
||||
logprobs_chunked = output_chunked.outputs[0].logprobs
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
vllm_model = LLM(model, long_prefill_token_threshold=4, enforce_eager=True)
|
||||
output_chunked = vllm_model.generate(prompts, sampling_params)
|
||||
logprobs_chunked = output_chunked.outputs[0].logprobs
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
|
||||
vllm_model = LLM(model,
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True
|
||||
},
|
||||
})
|
||||
output = vllm_model.generate(prompts, sampling_params)
|
||||
logprobs = output.outputs[0].logprobs
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
vllm_model = LLM(model,
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True
|
||||
},
|
||||
})
|
||||
output = vllm_model.generate(prompts, sampling_params)
|
||||
logprobs = output.outputs[0].logprobs
|
||||
del vllm_model
|
||||
torch.npu.empty_cache()
|
||||
|
||||
logprobs_similarity = torch.cosine_similarity(
|
||||
logprobs_chunked.flatten(), logprobs.flatten(), dim=0)
|
||||
assert logprobs_similarity > 0.95
|
||||
logprobs_similarity = torch.cosine_similarity(logprobs_chunked.flatten(),
|
||||
logprobs.flatten(),
|
||||
dim=0)
|
||||
assert logprobs_similarity > 0.95
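The 0.95 threshold is a tolerance on numeric drift rather than an exact-match requirement; a self-contained sketch with dummy tensors shows the comparison pattern:

import torch

ref = torch.randn(8, 32)
noisy = ref + 0.01 * torch.randn(8, 32)   # stands in for chunked-vs-unchunked drift
similarity = torch.cosine_similarity(ref.flatten(), noisy.flatten(), dim=0)
assert similarity > 0.95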
|
||||
|
|
|
@ -21,8 +21,8 @@ from typing import Optional

from modelscope import snapshot_download  # type: ignore[import-untyped]

from tests.conftest import HfRunner
from tests.utils import check_embeddings_close, matryoshka_fy
from tests.e2e.conftest import HfRunner
from tests.e2e.utils import check_embeddings_close, matryoshka_fy


def run_embedding_correctness_test(
@ -18,22 +18,19 @@
|
|||
#
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
import regex as re
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
|
||||
|
||||
GuidedDecodingBackendV0 = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
GuidedDecodingBackendV1 = ["xgrammar", "guidance"]
|
||||
GuidedDecodingBackend = list(
|
||||
set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1))
|
||||
GuidedDecodingBackend = ["xgrammar", "guidance"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
@ -84,20 +81,9 @@ def sample_json_schema():
|
|||
}
|
||||
|
||||
|
||||
def check_backend(guided_decoding_backend: str):
|
||||
if guided_decoding_backend not in GuidedDecodingBackendV0 and os.getenv(
|
||||
"VLLM_USE_V1") == "0":
|
||||
pytest.skip(f"{guided_decoding_backend} does not support v0, skip it.")
|
||||
if guided_decoding_backend not in GuidedDecodingBackendV1 and os.getenv(
|
||||
"VLLM_USE_V1") == "1":
|
||||
pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
|
||||
def test_guided_json_completion(guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
check_backend(guided_decoding_backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
|
@ -134,8 +120,6 @@ def test_guided_json_completion(guided_decoding_backend: str,
|
|||
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
|
||||
def test_guided_regex(guided_decoding_backend: str, sample_regex):
|
||||
check_backend(guided_decoding_backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
|
|
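A condensed sketch of what the guided-decoding tests exercise: constrain generation to a JSON schema through GuidedDecodingParams and validate the output. The schema and prompt below are placeholders; MODEL_NAME and the backend names come from the test file itself.

import json

import jsonschema
from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

schema = {"type": "object", "properties": {"name": {"type": "string"}}}
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=200,
    guided_decoding=GuidedDecodingParams(json=schema, backend="xgrammar"),
)
llm = LLM(model=MODEL_NAME, max_model_len=1024)
outputs = llm.generate("Describe a person as JSON.", sampling_params)
parsed = json.loads(outputs[0].outputs[0].text)
jsonschema.validate(instance=parsed, schema=schema)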
|
@ -3,7 +3,7 @@ import vllm
from modelscope import snapshot_download  # type: ignore
from vllm.lora.request import LoRARequest

from tests.conftest import VllmRunner
from tests.e2e.conftest import VllmRunner

MODEL_PATH = "vllm-ascend/ilama-3.2-1B"

@ -30,7 +30,7 @@ from vllm import SamplingParams
from vllm.assets.image import ImageAsset

import vllm_ascend  # noqa: F401
from tests.conftest import VllmRunner
from tests.e2e.conftest import VllmRunner

MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
@ -1,259 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
|
||||
#
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from openai import BadRequestError
|
||||
from transformers import AutoConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
if not hasattr(EngineArgs, "enable_prompt_embeds"):
|
||||
pytest.skip("Not supported vllm version", allow_module_level=True)
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = snapshot_download("LLM-Research/Llama-3.2-1B-Instruct")
|
||||
|
||||
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args() -> list[str]:
|
||||
return [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
# Prompt Embeds server args
|
||||
"--enable-prompt-embeds",
|
||||
"--no-enable-chunked-prefill",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module",
|
||||
params=["", "--disable-frontend-multiprocessing"])
|
||||
def server_with_prompt_embeds(default_server_args, request):
|
||||
if request.param:
|
||||
default_server_args.append(request.param)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client_with_prompt_embeds(server_with_prompt_embeds):
|
||||
async with server_with_prompt_embeds.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
def create_dummy_embeds(num_tokens: int = 5) -> str:
|
||||
"""Create dummy embeddings and return them as base64 encoded string."""
|
||||
dummy_embeds = torch.randn(num_tokens, CONFIG.hidden_size)
|
||||
buffer = io.BytesIO()
|
||||
torch.save(dummy_embeds, buffer)
|
||||
return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("VLLM_USE_V1") == "1",
|
||||
reason="Enable embedding input will fallback to v0, skip it")
|
||||
async def test_completions_with_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
|
||||
# Test case: Single prompt embeds input
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
assert len(completion.choices[0].text) >= 1
|
||||
assert completion.choices[0].prompt_logprobs is None
|
||||
|
||||
# Test case: batch completion with prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
|
||||
assert len(completion.choices) == 2
|
||||
assert len(completion.choices[0].text) >= 1
|
||||
assert len(completion.choices[1].text) >= 1
|
||||
|
||||
# Test case: streaming with prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
single_completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
single_output = single_completion.choices[0].text
|
||||
|
||||
stream = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
chunks = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
chunks.append(chunk.choices[0].text)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
assert finish_reason_count == 1
|
||||
assert chunk.choices[0].finish_reason == "length"
|
||||
assert chunk.choices[0].text
|
||||
assert "".join(chunks) == single_output
|
||||
|
||||
# Test case: batch streaming with prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
stream = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
|
||||
chunks_stream_embeds: list[list[str]] = [[], []]
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
chunks_stream_embeds[chunk.choices[0].index].append(
|
||||
chunk.choices[0].text)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
assert finish_reason_count == 2
|
||||
assert chunk.choices[0].finish_reason == "length"
|
||||
assert chunk.choices[0].text
|
||||
assert len(chunks_stream_embeds[0]) > 0
|
||||
assert len(chunks_stream_embeds[1]) > 0
|
||||
|
||||
# Test case: mixed text and prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion_mixed = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="This is a prompt",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
assert len(completion.choices) == 2
|
||||
completion_text_only = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="This is a prompt",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
completion_embeds_only = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
# Embeddings responses should be handled first
|
||||
assert completion_mixed.choices[0].text == completion_embeds_only.choices[
|
||||
0].text
|
||||
assert completion_mixed.choices[1].text == completion_text_only.choices[
|
||||
0].text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("VLLM_USE_V1") == "1",
|
||||
reason="Enable embedding input will fallback to v0, skip it")
|
||||
async def test_completions_errors_with_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
|
||||
# Test error case: invalid prompt_embeds
|
||||
with pytest.raises(BadRequestError):
|
||||
await client_with_prompt_embeds.completions.create(
|
||||
prompt="",
|
||||
model=model_name,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={"prompt_embeds": "invalid_base64"})
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("VLLM_USE_V1") == "1",
|
||||
reason="Enable embedding input will fallback to v0, skip it")
|
||||
async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int,
|
||||
model_name: str):
|
||||
# Test case: Logprobs using prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
echo=False,
|
||||
logprobs=logprobs_arg,
|
||||
extra_body={"prompt_embeds": encoded_embeds})
|
||||
|
||||
logprobs = completion.choices[0].logprobs
|
||||
assert logprobs is not None
|
||||
assert len(logprobs.text_offset) == 5
|
||||
assert len(logprobs.token_logprobs) == 5
|
||||
assert len(logprobs.top_logprobs) == 5
|
||||
for top_logprobs in logprobs.top_logprobs[1:]:
|
||||
assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
|
||||
assert len(logprobs.tokens) == 5
|
||||
|
||||
# Test case: Log probs with batch completion and prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
echo=False,
|
||||
logprobs=logprobs_arg,
|
||||
extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]})
|
||||
|
||||
assert len(completion.choices) == 2
|
||||
for choice in completion.choices:
|
||||
logprobs = choice.logprobs
|
||||
assert logprobs is not None
|
||||
assert len(logprobs.text_offset) == 5
|
||||
assert len(logprobs.token_logprobs) == 5
|
||||
assert len(logprobs.top_logprobs) == 5
|
||||
for top_logprobs in logprobs.top_logprobs[1:]:
|
||||
assert max(logprobs_arg,
|
||||
1) <= len(top_logprobs) <= logprobs_arg + 1
|
||||
assert len(logprobs.tokens) == 5
|
|
@ -1,390 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheTensor)
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.core.scheduler import AscendScheduler
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
|
||||
def create_scheduler(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_prefix_caching: Optional[bool] = None,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
) -> AscendScheduler:
|
||||
'''Create scheduler under test.
|
||||
|
||||
Args:
|
||||
model: model under test
|
||||
max_num_seqs: max sequences to schedule
|
||||
max_num_batched_tokens: max num tokens to batch
|
||||
enable_prefix_caching: optionally force APC config
|
||||
(True/False) or use default
|
||||
(None)
|
||||
|
||||
Returns:
|
||||
:class:`Scheduler` instance
|
||||
'''
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=max_num_batched_tokens,
|
||||
long_prefill_token_threshold=long_prefill_token_threshold,
|
||||
disable_chunked_mm_input=disable_chunked_mm_input,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="float16",
|
||||
seed=42,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = ({} if enable_prefix_caching is None else {
|
||||
'enable_prefix_caching': enable_prefix_caching
|
||||
})
|
||||
cache_config = CacheConfig(
|
||||
block_size=16,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
**kwargs_cache,
|
||||
)
|
||||
vllm_config = VllmConfig(scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
cache_config=cache_config)
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=10000, # A large number of blocks to hold all requests
|
||||
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(16, 1, 1, torch.float32, False,
|
||||
None))
|
||||
],
|
||||
)
|
||||
cache_config.num_gpu_blocks = 10000
|
||||
return AscendScheduler(
|
||||
vllm_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
|
||||
|
||||
def create_requests(num_requests: int,
|
||||
num_tokens: int = 10,
|
||||
mm_positions: Optional[list[PlaceholderRange]] = None,
|
||||
max_tokens: int = 16,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
prompt_logprobs: Optional[int] = None):
|
||||
sampling_params = SamplingParams(ignore_eos=False,
|
||||
max_tokens=max_tokens,
|
||||
stop_token_ids=stop_token_ids,
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
mm_inputs = [MultiModalKwargs({})] * len(mm_position)
|
||||
else:
|
||||
mm_position = None
|
||||
mm_inputs = None
|
||||
request = Request(
|
||||
request_id=f"{i}",
|
||||
prompt_token_ids=[i] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
multi_modal_inputs=mm_inputs,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
pooling_params=None,
|
||||
)
|
||||
requests.append(request)
|
||||
return requests
|
||||
|
||||
|
||||
def test_add_requests():
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=10)
|
||||
|
||||
for i, request in enumerate(requests):
|
||||
scheduler.add_request(request)
|
||||
assert request.request_id in scheduler.requests
|
||||
assert len(scheduler.waiting) == i + 1
|
||||
|
||||
|
||||
def test_finish_request():
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=10)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
for i, request in enumerate(requests):
|
||||
scheduler.finish_requests(request.request_id,
|
||||
RequestStatus.FINISHED_ABORTED)
|
||||
assert request.request_id not in scheduler.requests
|
||||
assert len(scheduler.waiting) == 9 - i
|
||||
|
||||
|
||||
def test_get_num_unfinished_requests():
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=10)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
for i, request in enumerate(requests):
|
||||
scheduler.finish_requests(request.request_id,
|
||||
RequestStatus.FINISHED_STOPPED)
|
||||
assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [
|
||||
(None, None),
|
||||
(True, 5),
|
||||
])
|
||||
def test_schedule(enable_prefix_caching: Optional[bool],
|
||||
prompt_logprobs: Optional[int]):
|
||||
'''Test scheduling.
|
||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||
'''
|
||||
scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching)
|
||||
requests = create_requests(num_requests=10,
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
# Test initial scheduling
|
||||
output = scheduler.schedule()
|
||||
assert len(output.scheduled_new_reqs) == len(requests)
|
||||
assert output.scheduled_cached_reqs.num_reqs == 0
|
||||
assert len(output.finished_req_ids) == 0
|
||||
# Verify all requests are scheduled.
|
||||
for req_id, num_tokens in output.num_scheduled_tokens.items():
|
||||
assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
|
||||
|
||||
# Verify requests moved from waiting to running
|
||||
assert len(scheduler.waiting) == 0
|
||||
assert len(scheduler.running) == len(requests)
|
||||
for i, request in enumerate(requests):
|
||||
assert scheduler.running[i] == request
|
||||
|
||||
|
||||
def test_stop_via_update_from_output():
|
||||
"""Test stopping behavior through update_from_output"""
|
||||
scheduler = create_scheduler()
|
||||
|
||||
# Test case 1: Stop on EOS token
|
||||
requests = create_requests(num_requests=2, max_tokens=10)
|
||||
for req in requests:
|
||||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
scheduler.scheduled_req_ids.add(req.request_id)
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 1,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
scheduled_spec_decode_tokens={},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID],
|
||||
[10,
|
||||
11]], # First request hits EOS, second continues
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
# Verify first request stopped, second continues
|
||||
assert len(scheduler.running) == 1
|
||||
assert scheduler.running[0].request_id == requests[1].request_id
|
||||
assert requests[0].status == RequestStatus.FINISHED_STOPPED
|
||||
assert requests[0].request_id in scheduler.finished_req_ids
|
||||
assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID]
|
||||
assert list(requests[1].output_token_ids) == [10, 11]
|
||||
|
||||
# Test case 2: Stop on custom stop token
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=[42, 43])
|
||||
for req in requests:
|
||||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
scheduler.scheduled_req_ids.add(req.request_id)
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
scheduled_spec_decode_tokens={},
|
||||
total_num_scheduled_tokens=5,
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
# Verify first request stopped on custom token
|
||||
assert len(scheduler.running) == 1
|
||||
assert scheduler.running[0].request_id == requests[1].request_id
|
||||
assert requests[0].status == RequestStatus.FINISHED_STOPPED
|
||||
assert requests[0].stop_reason == 42
|
||||
assert requests[0].request_id in scheduler.finished_req_ids
|
||||
assert list(requests[0].output_token_ids) == [10, 42]
|
||||
assert list(requests[1].output_token_ids) == [13, 14]
|
||||
|
||||
# Test case 3: Stop on max tokens
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=2, max_tokens=2)
|
||||
for req in requests:
|
||||
req.num_computed_tokens = req.num_tokens
|
||||
scheduler.requests[req.request_id] = req
|
||||
scheduler.running.append(req)
|
||||
scheduler.scheduled_req_ids.add(req.request_id)
|
||||
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 1
|
||||
},
|
||||
scheduled_spec_decode_tokens={},
|
||||
total_num_scheduled_tokens=4,
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
# Verify first request stopped due to length
|
||||
assert len(scheduler.running) == 1
|
||||
assert scheduler.running[0].request_id == requests[1].request_id
|
||||
assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
assert requests[0].request_id in scheduler.finished_req_ids
|
||||
assert list(requests[0].output_token_ids) == [10, 11
|
||||
] # Truncated to max_tokens
|
||||
assert list(requests[1].output_token_ids) == [13]
|
||||
|
||||
# Test case 4: Ignore EOS flag
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=1, max_tokens=10)
|
||||
requests[0].sampling_params.ignore_eos = True
|
||||
requests[0].num_computed_tokens = requests[0].num_tokens
|
||||
scheduler.requests[requests[0].request_id] = requests[0]
|
||||
scheduler.running.append(requests[0])
|
||||
scheduler.scheduled_req_ids.add(requests[0].request_id)
|
||||
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={requests[0].request_id: 3},
|
||||
scheduled_spec_decode_tokens={},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
# Verify request continues past EOS
|
||||
assert len(scheduler.running) == 1
|
||||
assert not requests[0].is_finished()
|
||||
assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
|
|
@ -0,0 +1,106 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/tests/utils.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import functools
|
||||
import os
|
||||
import signal
|
||||
from collections.abc import Sequence
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
def fork_new_process_for_each_test(
|
||||
f: Callable[_P, None]) -> Callable[_P, None]:
|
||||
"""Decorator to fork a new process for each test function.
|
||||
See https://github.com/vllm-project/vllm/issues/7053 for more details.
|
||||
"""
|
||||
|
||||
@functools.wraps(f)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
|
||||
# Make the process the leader of its own process group
|
||||
# to avoid sending SIGTERM to the parent process
|
||||
os.setpgrp()
|
||||
from _pytest.outcomes import Skipped
|
||||
pid = os.fork()
|
||||
print(f"Fork a new process to run a test {pid}")
|
||||
if pid == 0:
|
||||
try:
|
||||
f(*args, **kwargs)
|
||||
except Skipped as e:
|
||||
# convert Skipped to exit code 0
|
||||
print(str(e))
|
||||
os._exit(0)
|
||||
except Exception:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
os._exit(1)
|
||||
else:
|
||||
os._exit(0)
|
||||
else:
|
||||
pgid = os.getpgid(pid)
|
||||
_pid, _exitcode = os.waitpid(pid, 0)
|
||||
# ignore SIGTERM signal itself
|
||||
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||
# kill all child processes
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
# restore the signal handler
|
||||
signal.signal(signal.SIGTERM, old_signal_handler)
|
||||
assert _exitcode == 0, (f"function {f} failed when called with"
|
||||
f" args {args} and kwargs {kwargs}")
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
|
||||
tensor = torch.tensor(tensor)
|
||||
tensor = tensor[..., :dimensions]
|
||||
tensor = F.normalize(tensor, p=2, dim=1)
|
||||
return tensor
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
*,
|
||||
embeddings_0_lst: Sequence[list[float]],
|
||||
embeddings_1_lst: Sequence[list[float]],
|
||||
name_0: str,
|
||||
name_1: str,
|
||||
tol: float = 1e-3,
|
||||
) -> None:
|
||||
assert len(embeddings_0_lst) == len(embeddings_1_lst)
|
||||
|
||||
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
|
||||
zip(embeddings_0_lst, embeddings_1_lst)):
|
||||
assert len(embeddings_0) == len(embeddings_1), (
|
||||
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
|
||||
|
||||
sim = F.cosine_similarity(torch.tensor(embeddings_0),
|
||||
torch.tensor(embeddings_1),
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\nCosine similarity: \t{sim:.4f}"
|
||||
f"\n{name_0}:\t{embeddings_0[:16]!r}"
|
||||
f"\n{name_1}:\t{embeddings_1[:16]!r}")
|
||||
|
||||
assert sim >= 1 - tol, fail_msg
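A hypothetical example of how the helpers in this new utils module are meant to be combined in an e2e test; the embedding values are dummies.

from tests.e2e.utils import (check_embeddings_close,
                             fork_new_process_for_each_test)


@fork_new_process_for_each_test
def test_embeddings_close_sketch():
    hf_embeds = [[0.1, 0.2, 0.3, 0.4]]      # placeholder reference embeddings
    vllm_embeds = [[0.1, 0.2, 0.3, 0.4]]    # placeholder embeddings under test
    check_embeddings_close(
        embeddings_0_lst=hf_embeds,
        embeddings_1_lst=vllm_embeds,
        name_0="hf",
        name_1="vllm",
    )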
|
|
@ -15,7 +15,7 @@

import unittest

from vllm_ascend.utils import adapt_patch
from vllm_ascend.utils import adapt_patch, register_ascend_customop

# The fused moe ops test will hit the infer_schema error, so the patch needs
# to be applied here to make the test pass.

@ -28,4 +28,5 @@ class TestBase(unittest.TestCase):
        # adapt patch by default.
        adapt_patch(True)
        adapt_patch()
        register_ascend_customop()
        super().setUp()
|
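Any unit test that subclasses TestBase now gets the patches applied and the Ascend custom ops registered before it runs; a hypothetical subclass (the import path is assumed):

from tests.ut.base import TestBase  # assumed location of TestBase


class TestExample(TestBase):

    def test_setup_applied(self):
        # adapt_patch() and register_ascend_customop() already ran in setUp().
        self.assertTrue(True)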
|
|
@ -1,5 +1,6 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -14,3 +15,12 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

from vllm_ascend.utils import adapt_patch  # noqa E402
from vllm_ascend.utils import register_ascend_customop

adapt_patch()
adapt_patch(True)

# register Ascend CustomOps here because the unit tests rely on them
register_ascend_customop()
@ -0,0 +1,61 @@
|
|||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_tensor():
|
||||
return torch.randn(4, 8, dtype=torch.float16)
|
||||
|
||||
|
||||
@patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1)
|
||||
def test_QuickGELU_forward(mock_gelu, dummy_tensor):
|
||||
layer = QuickGELU()
|
||||
out = layer.forward(dummy_tensor)
|
||||
|
||||
expected_out = dummy_tensor + 1
|
||||
assert torch.allclose(out, expected_out)
|
||||
|
||||
mock_gelu.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_310p_return", [True, False])
|
||||
@patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1)
|
||||
def test_SiluAndMul_forward(mock_swiglu, is_310p_return, dummy_tensor):
|
||||
|
||||
with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
|
||||
layer = SiluAndMul()
|
||||
out = layer.forward(dummy_tensor)
|
||||
|
||||
if is_310p_return:
|
||||
expected_arg = dummy_tensor.to(torch.float32)
|
||||
else:
|
||||
expected_arg = dummy_tensor
|
||||
|
||||
# assert mock_swiglu.call_count == 1
|
||||
mock_swiglu.assert_called_once()
|
||||
|
||||
actual_arg = mock_swiglu.call_args[0][0]
|
||||
assert torch.allclose(
|
||||
actual_arg,
|
||||
expected_arg), "npu_swiglu called with unexpected input"
|
||||
|
||||
expected_out = dummy_tensor + 1
|
||||
assert torch.allclose(out, expected_out)
|
|
@ -0,0 +1,134 @@
|
|||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
|
||||
from vllm_ascend.quantization.func_wrapper import (wrapper_rmsnorm_forward_oot,
|
||||
wrapper_rmsnorm_init)
|
||||
|
||||
|
||||
class MockRMSNorm:
|
||||
|
||||
def __init__(self, hidden_size: int, **extra_args):
|
||||
self.hidden_size = hidden_size
|
||||
self.weight = torch.ones(hidden_size)
|
||||
self.input_scale = 1.0
|
||||
self.input_offset = 0.0
|
||||
self.variance_epsilon = 1e-6
|
||||
self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
|
||||
requires_grad=False)
|
||||
self.ignore_anti = extra_args.get('ignore_anti', True)
|
||||
|
||||
|
||||
class TestFuncWrapper(unittest.TestCase):
|
||||
|
||||
def test_wrapper_rmsnorm_init(self):
|
||||
|
||||
@wrapper_rmsnorm_init
|
||||
def init(self, hidden_size: int, **extra_args) -> None:
|
||||
self.hidden_size = hidden_size
|
||||
|
||||
hidden_size = 128
|
||||
extra_args = {'arg1': 'value1'}
|
||||
|
||||
rms_norm = MockRMSNorm(hidden_size, **extra_args)
|
||||
init(rms_norm, hidden_size, **extra_args)
|
||||
|
||||
self.assertTrue(hasattr(rms_norm, 'ignore_anti'))
|
||||
self.assertTrue(rms_norm.ignore_anti)
|
||||
|
||||
self.assertTrue(hasattr(rms_norm, 'bias'))
|
||||
self.assertIsInstance(rms_norm.bias, torch.nn.Parameter)
|
||||
self.assertEqual(rms_norm.bias.shape, torch.Size([hidden_size]))
|
||||
self.assertFalse(rms_norm.bias.requires_grad)
|
||||
|
||||
@patch('torch_npu._npu_quant_rms_norm')
|
||||
def test_wrapper_rmsnorm_forward_oot_with_residual(
|
||||
self, mock_npu_quant_rms_norm):
|
||||
hidden_size = 128
|
||||
x = torch.randn(hidden_size)
|
||||
residual = torch.randn(hidden_size)
|
||||
expected_out = torch.randn(hidden_size)
|
||||
|
||||
mock_npu_quant_rms_norm.return_value = (expected_out, residual)
|
||||
|
||||
@wrapper_rmsnorm_forward_oot
|
||||
def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None):
|
||||
return x, residual
|
||||
|
||||
rms_norm = MockRMSNorm(hidden_size)
|
||||
rms_norm.ignore_anti = False
|
||||
|
||||
output, res = forward_oot(rms_norm, x, residual)
|
||||
|
||||
mock_npu_quant_rms_norm.assert_called_once()
|
||||
|
||||
args, kwargs = mock_npu_quant_rms_norm.call_args
|
||||
self.assertTrue(torch.equal(args[1], rms_norm.weight))
|
||||
self.assertTrue(torch.equal(args[2], rms_norm.bias))
|
||||
self.assertEqual(args[3], rms_norm.input_scale)
|
||||
self.assertEqual(args[4], rms_norm.input_offset)
|
||||
self.assertEqual(args[5], rms_norm.variance_epsilon)
|
||||
self.assertTrue(torch.equal(res, residual))
|
||||
|
||||
@patch('torch_npu._npu_quant_rms_norm')
|
||||
def test_wrapper_rmsnorm_forward_oot_without_residual(
|
||||
self, mock_npu_quant_rms_norm):
|
||||
hidden_size = 128
|
||||
x = torch.randn(hidden_size)
|
||||
expected_out = torch.randn(hidden_size)
|
||||
|
||||
mock_npu_quant_rms_norm.return_value = expected_out
|
||||
|
||||
@wrapper_rmsnorm_forward_oot
|
||||
def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None):
|
||||
return x
|
||||
|
||||
rms_norm = MockRMSNorm(hidden_size)
|
||||
rms_norm.ignore_anti = False
|
||||
|
||||
output = forward_oot(rms_norm, x)
|
||||
|
||||
mock_npu_quant_rms_norm.assert_called_once()
|
||||
|
||||
args, kwargs = mock_npu_quant_rms_norm.call_args
|
||||
self.assertTrue(torch.equal(args[0], x))
|
||||
self.assertTrue(torch.equal(args[1], rms_norm.weight))
|
||||
self.assertTrue(torch.equal(args[2], rms_norm.bias))
|
||||
self.assertEqual(args[3], rms_norm.input_scale)
|
||||
self.assertEqual(args[4], rms_norm.input_offset)
|
||||
self.assertEqual(args[5], rms_norm.variance_epsilon)
|
||||
|
||||
self.assertTrue(torch.equal(output, expected_out))
|
||||
|
||||
def test_wrapper_rmsnorm_forward_oot_ignore_anti_with_residual(self):
|
||||
hidden_size = 128
|
||||
x = torch.randn(hidden_size)
|
||||
residual = torch.randn(hidden_size)
|
||||
|
||||
@wrapper_rmsnorm_forward_oot
|
||||
def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None):
|
||||
return x, residual
|
||||
|
||||
rms_norm = MockRMSNorm(hidden_size)
|
||||
rms_norm.ignore_anti = True
|
||||
|
||||
output, res = forward_oot(rms_norm, x, residual)
|
||||
|
||||
self.assertTrue(torch.equal(output, x.add_(rms_norm.bias)))
|
||||
self.assertTrue(torch.equal(res, residual))
|
||||
|
||||
def test_wrapper_rmsnorm_forward_oot_ignore_anti_no_residual(self):
|
||||
hidden_size = 128
|
||||
x = torch.randn(hidden_size)
|
||||
|
||||
@wrapper_rmsnorm_forward_oot
|
||||
def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None):
|
||||
return x
|
||||
|
||||
rms_norm = MockRMSNorm(hidden_size)
|
||||
rms_norm.ignore_anti = True
|
||||
|
||||
output = forward_oot(rms_norm, x)
|
||||
|
||||
self.assertTrue(torch.equal(output, x.add_(rms_norm.bias)))
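For orientation, the behaviour these tests pin down can be summarised by a small decorator sketch inferred from the assertions above; it is not the actual vllm_ascend implementation.

import torch


def wrapper_rmsnorm_init_sketch(init_func):
    # After the wrapped __init__ runs, attach the quantization-related
    # attributes the tests check for: ignore_anti (default True) and a
    # frozen zero bias of shape [hidden_size].
    def wrapper(self, hidden_size: int, **extra_args) -> None:
        init_func(self, hidden_size, **extra_args)
        self.ignore_anti = extra_args.get("ignore_anti", True)
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                       requires_grad=False)

    return wrapper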
|
|
@ -14,7 +14,6 @@
|
|||
#
|
||||
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
@ -170,8 +169,30 @@ class TestAscendConfig(TestBase):
|
|||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
|
||||
# For V1 engine
|
||||
with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
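The additional_config shape these cases toggle looks like the following when handed to VllmConfig (values are illustrative; enabling torchair graph mode additionally requires a torchair-capable model and a non-eager run, per the wrong-case tests below):

additional_config = {
    "torchair_graph_config": {
        "enabled": False,
    },
    "refresh": True,
}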
|
||||
|
||||
@_clean_up_ascend_config
|
||||
def test_check_ascend_config_wrong_case(self):
|
||||
test_vllm_config = VllmConfig()
|
||||
|
||||
# torchair + eager mode
|
||||
with self.assertRaises(RuntimeError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
|
@ -179,86 +200,39 @@ class TestAscendConfig(TestBase):
|
|||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
enforce_eager = True
|
||||
check_ascend_config(test_vllm_config, enforce_eager)
|
||||
# torchair + non deepseek model
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
|
||||
fake_model_config = ModelConfig(model=model_path)
|
||||
fake_model_config.hf_config = PretrainedConfig()
|
||||
fake_model_config.hf_config.model_type = "llama"
|
||||
test_vllm_config.model_config = fake_model_config
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
|
||||
# aclgraph + deepseek model
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
|
||||
fake_model_config = ModelConfig(model=model_path)
|
||||
fake_model_config.hf_config = PretrainedConfig()
|
||||
fake_model_config.hf_config.model_type = "deepseek"
|
||||
test_vllm_config.model_config = fake_model_config
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
|
||||
@_clean_up_ascend_config
|
||||
def test_check_ascend_config_wrong_case(self):
|
||||
test_vllm_config = VllmConfig()
|
||||
# For V0 engine
|
||||
with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, True)
|
||||
# For V1 engine
|
||||
with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
|
||||
# torchair + eager mode
|
||||
with self.assertRaises(RuntimeError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
init_ascend_config(test_vllm_config)
|
||||
enforce_eager = True
|
||||
check_ascend_config(test_vllm_config, enforce_eager)
|
||||
# torchair + non deepseek model
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
model_path = os.path.join(os.path.dirname(__file__),
|
||||
"fake_weight")
|
||||
fake_model_config = ModelConfig(model=model_path)
|
||||
fake_model_config.hf_config = PretrainedConfig()
|
||||
fake_model_config.hf_config.model_type = "llama"
|
||||
test_vllm_config.model_config = fake_model_config
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
# aclgraph + deepseek model
|
||||
with self.assertRaises(NotImplementedError):
|
||||
test_vllm_config.additional_config = {
|
||||
"torchair_graph_config": {
|
||||
"enabled": False,
|
||||
},
|
||||
"refresh": True
|
||||
}
|
||||
model_path = os.path.join(os.path.dirname(__file__),
|
||||
"fake_weight")
|
||||
fake_model_config = ModelConfig(model=model_path)
|
||||
fake_model_config.hf_config = PretrainedConfig()
|
||||
fake_model_config.hf_config.model_type = "deepseek"
|
||||
test_vllm_config.model_config = fake_model_config
|
||||
init_ascend_config(test_vllm_config)
|
||||
check_ascend_config(test_vllm_config, False)
|
||||
|
||||
def test_check_torchair_supported(self):
|
||||
test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
|
||||
('qwen', False), ('llama', False)]
|
||||
|
|
|
@ -373,7 +373,6 @@ class TestNPUPlatform(TestBase):
|
|||
@patch("vllm_ascend.utils.is_310p", return_value=False)
|
||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm.envs.VLLM_USE_V1", True)
|
||||
def test_check_and_update_config_v1_worker_class_selection(
|
||||
self, mock_init_ascend, mock_check_ascend, mock_is_310p):
|
||||
mock_init_ascend.return_value = self.mock_ascend_config
|
||||
|
@ -389,73 +388,9 @@ class TestNPUPlatform(TestBase):
|
|||
"vllm_ascend.worker.worker_v1.NPUWorker",
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm.envs.VLLM_USE_V1", False)
|
||||
def test_check_and_update_config_speculative_worker_config(
|
||||
self, mock_init_ascend, mock_check_ascend):
|
||||
mock_init_ascend.return_value = self.mock_ascend_config
|
||||
self.mock_vllm_config.speculative_config = MagicMock()
|
||||
self.mock_vllm_config.speculative_config.disable_logprobs = True
|
||||
self.mock_vllm_config.parallel_config.worker_cls = "auto"
|
||||
|
||||
with patch.dict("os.environ", {}):
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(self.mock_vllm_config)
|
||||
import os
|
||||
|
||||
self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
|
||||
self.assertEqual(
|
||||
self.mock_vllm_config.parallel_config.worker_cls,
|
||||
"vllm.spec_decode.spec_decode_worker.create_spec_worker",
|
||||
)
|
||||
self.assertEqual(
|
||||
self.mock_vllm_config.parallel_config.sd_worker_cls,
|
||||
"vllm_ascend.worker.worker.NPUWorker",
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm.envs.VLLM_USE_V1", False)
|
||||
def test_check_and_update_config_multi_step_worker_config(
|
||||
self, mock_init_ascend, mock_check_ascend):
|
||||
mock_init_ascend.return_value = self.mock_ascend_config
|
||||
self.mock_vllm_config.scheduler_config.is_multi_step = True
|
||||
self.mock_vllm_config.parallel_config.worker_cls = "auto"
|
||||
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(self.mock_vllm_config)
|
||||
self.assertEqual(
|
||||
self.mock_vllm_config.parallel_config.worker_cls,
|
||||
"vllm_ascend.worker.multi_step_worker.MultiStepWorker",
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm.envs.VLLM_USE_V1", False)
|
||||
def test_check_and_update_config_default_worker_config(
|
||||
self, mock_init_ascend, mock_check_ascend):
|
||||
mock_init_ascend.return_value = self.mock_ascend_config
|
||||
self.mock_vllm_config.parallel_config.worker_cls = "auto"
|
||||
self.mock_vllm_config.scheduler_config.is_multi_step = False
|
||||
|
||||
from vllm_ascend import platform
|
||||
|
||||
importlib.reload(platform)
|
||||
self.platform.check_and_update_config(self.mock_vllm_config)
|
||||
self.assertEqual(
|
||||
self.mock_vllm_config.parallel_config.worker_cls,
|
||||
"vllm_ascend.worker.worker.NPUWorker",
|
||||
)
|
||||
|
||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
||||
@patch("vllm_ascend.utils.is_310p", return_value=True)
|
||||
@patch("vllm.envs.VLLM_USE_V1", True)
|
||||
def test_check_and_update_config_310p_no_custom_ops(
|
||||
self, mock_is_310p, mock_init_ascend, mock_check_ascend):
|
||||
mock_init_ascend.return_value = self.mock_ascend_config
|
||||
|
@ -546,45 +481,6 @@ class TestNPUPlatform(TestBase):
|
|||
result,
|
||||
"vllm_ascend.attention.attention_v1.AscendAttentionBackend")
|
||||
|
||||
@patch('vllm_ascend.platform.get_ascend_config')
|
||||
def test_get_attn_backend_cls_use_mla_only(self, mock_get_ascend_config):
|
||||
mock_config = MagicMock()
|
||||
mock_config.torchair_graph_config.enabled = False
|
||||
|
||||
mock_get_ascend_config.return_value = mock_config
|
||||
|
||||
result = self.platform.get_attn_backend_cls(
|
||||
selected_backend="ascend",
|
||||
head_size=64,
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=False,
|
||||
use_mla=True,
|
||||
)
|
||||
self.assertEqual(
|
||||
result,
|
||||
"vllm_ascend.attention.attention.AscendMLAAttentionBackend")
|
||||
|
||||
@patch('vllm_ascend.platform.get_ascend_config')
|
||||
def test_get_attn_backend_cls_default_case(self, mock_get_ascend_config):
|
||||
mock_config = MagicMock()
|
||||
mock_config.torchair_graph_config.enabled = False
|
||||
|
||||
mock_get_ascend_config.return_value = mock_config
|
||||
|
||||
result = self.platform.get_attn_backend_cls(
|
||||
selected_backend="ascend",
|
||||
head_size=64,
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=False,
|
||||
use_mla=False,
|
||||
)
|
||||
self.assertEqual(
|
||||
result, "vllm_ascend.attention.attention.AscendAttentionBackend")
|
||||
|
||||
def test_get_punica_wrapper(self):
|
||||
result = self.platform.get_punica_wrapper()
|
||||
self.assertEqual(
|
||||
|
|
|
@ -239,17 +239,27 @@ class TestUtils(TestBase):
|
|||
def test_vllm_version_is(self):
|
||||
with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}):
|
||||
with mock.patch("vllm.__version__", "1.0.0"):
|
||||
self.assertTrue(utils.vllm_version_is("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is("2.0.0"))
|
||||
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
|
||||
with mock.patch("vllm.__version__", "2.0.0"):
|
||||
self.assertTrue(utils.vllm_version_is("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is("2.0.0"))
|
||||
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
|
||||
with mock.patch("vllm.__version__", "1.0.0"):
|
||||
self.assertTrue(utils.vllm_version_is("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is("2.0.0"))
|
||||
self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0"))
|
||||
with mock.patch("vllm.__version__", "2.0.0"):
|
||||
self.assertTrue(utils.vllm_version_is("2.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is("1.0.0"))
|
||||
self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0"))
|
||||
self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0"))
|
||||
# Test caching takes effect
|
||||
utils.vllm_version_is.cache_clear()
|
||||
utils.vllm_version_is("1.0.0")
|
||||
misses = utils.vllm_version_is.cache_info().misses
|
||||
hits = utils.vllm_version_is.cache_info().hits
|
||||
self.assertEqual(misses, 1)
|
||||
self.assertEqual(hits, 0)
|
||||
utils.vllm_version_is("1.0.0")
|
||||
hits = utils.vllm_version_is.cache_info().hits
|
||||
self.assertEqual(hits, 1)
|
||||
|
||||
def test_update_aclgraph_sizes(self):
|
||||
# max_num_batch_sizes < len(original_sizes)
|
||||
|
@ -301,6 +311,24 @@ class TestUtils(TestBase):
|
|||
self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
|
||||
"Delete kv cache bytes cache dir failed")
|
||||
|
||||
@mock.patch("vllm.model_executor.custom_op.CustomOp")
|
||||
@mock.patch("vllm_ascend.ops.activation.AscendQuickGELU")
|
||||
@mock.patch("vllm_ascend.ops.activation.AscendSiluAndMul")
|
||||
def test_register_ascend_customop(self, mock_ascend_silu_and_mul,
|
||||
mock_ascend_quick_gelu, mock_customop):
|
||||
utils._ASCEND_CUSTOMOP_IS_REIGISTERED = False
|
||||
|
||||
# ascend custom op is not registered
|
||||
utils.register_ascend_customop()
|
||||
# should call register_oot twice
|
||||
self.assertEqual(mock_customop.register_oot.call_count, 2)
|
||||
self.assertTrue(utils._ASCEND_CUSTOMOP_IS_REIGISTERED)
|
||||
|
||||
# ascend custom op is already registered
|
||||
utils.register_ascend_customop()
|
||||
# should not register_oot again, thus only called twice in this ut
|
||||
self.assertEqual(mock_customop.register_oot.call_count, 2)
|
||||
|
||||
|
||||
class TestProfileExecuteDuration(unittest.TestCase):
|
||||
|
||||
|
|
|
@ -1,355 +0,0 @@
|
|||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sequence import SequenceData, SequenceGroupMetadata
|
||||
|
||||
from vllm_ascend.worker.pooling_model_runner import (
|
||||
ModelInputForNPUWithPoolingMetadata, NPUPoolingModelRunner)
|
||||
|
||||
|
||||
class TestPoolingModelRunner(unittest.TestCase):
|
||||
"""Unit tests for the NPUPoolingModelRunner class."""
|
||||
|
||||
def _create_model_runner(self, model: str, *args,
|
||||
**kwargs) -> NPUPoolingModelRunner:
|
||||
engine_args = EngineArgs(model, *args, **kwargs)
|
||||
engine_config = engine_args.create_engine_config()
|
||||
model_runner = NPUPoolingModelRunner(vllm_config=engine_config, )
|
||||
return model_runner
|
||||
|
||||
def setUp(self):
|
||||
"""Initialize test fixtures and common mocks"""
|
||||
self.attn_backend = "npu"
|
||||
|
||||
model_runner = self._create_model_runner(
|
||||
"tests/ut/fake_weight",
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
self.runner = model_runner
|
||||
self.runner.attn_backend = self.attn_backend
|
||||
model_runner.model = MagicMock()
|
||||
self.runner = model_runner
|
||||
# Sample test data
|
||||
self.sample_tensor_dict = {"tensor1": torch.randn(3, 4)}
|
||||
self.sample_seq_group = [MagicMock(spec=SequenceGroupMetadata)]
|
||||
self.sample_finished_ids = ["req1", "req2"]
|
||||
|
||||
@patch(
|
||||
'vllm_ascend.worker.pooling_model_runner.ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict'
|
||||
)
|
||||
def test_make_model_input_from_broadcasted_tensor_dict(
|
||||
self, mock_from_dict):
|
||||
"""Test tensor dictionary conversion to model input"""
|
||||
# Setup mock return
|
||||
expected_output = MagicMock()
|
||||
mock_from_dict.return_value = expected_output
|
||||
|
||||
# Execute
|
||||
result = self.runner.make_model_input_from_broadcasted_tensor_dict(
|
||||
self.sample_tensor_dict)
|
||||
|
||||
# Verify
|
||||
mock_from_dict.assert_called_once_with(self.sample_tensor_dict,
|
||||
attn_backend=self.attn_backend)
|
||||
self.assertEqual(result, expected_output)
|
||||
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_pooling')
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
|
||||
def test_prepare_model_input_normal_case(self, mock_prepare_tensors,
|
||||
mock_prepare_pooling):
|
||||
"""Test normal flow of model input preparation"""
|
||||
# Setup mocks
|
||||
mock_model_input = ModelInputForNPUWithPoolingMetadata(
|
||||
seq_lens=[1, 2, 3])
|
||||
mock_prepare_tensors.return_value = mock_model_input
|
||||
|
||||
mock_pooling_metadata = MagicMock()
|
||||
mock_prepare_pooling.return_value = mock_pooling_metadata
|
||||
|
||||
# Execute
|
||||
result = self.runner.prepare_model_input(
|
||||
seq_group_metadata_list=self.sample_seq_group,
|
||||
finished_requests_ids=self.sample_finished_ids)
|
||||
|
||||
# Verify
|
||||
mock_prepare_tensors.assert_called_once_with(self.sample_seq_group,
|
||||
self.sample_finished_ids)
|
||||
mock_prepare_pooling.assert_called_once_with(self.sample_seq_group,
|
||||
mock_model_input.seq_lens)
|
||||
self.assertEqual(result.pooling_metadata, mock_pooling_metadata)
|
||||
|
||||
def test_prepare_model_input_null_sequence_group(self):
|
||||
"""Test assertion when seq_group_metadata_list is None"""
|
||||
with self.assertRaises(AssertionError):
|
||||
self.runner.prepare_model_input(
|
||||
seq_group_metadata_list=None,
|
||||
finished_requests_ids=self.sample_finished_ids)
|
||||
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
|
||||
def test_prepare_model_input_null_seq_lens(self, mock_prepare_tensors):
|
||||
"""Test assertion when seq_lens is None in model input"""
|
||||
# Setup mock with None seq_lens
|
||||
mock_model_input = MagicMock()
|
||||
mock_model_input.seq_lens = None
|
||||
mock_prepare_tensors.return_value = mock_model_input
|
||||
|
||||
with self.assertRaises(AssertionError):
|
||||
self.runner.prepare_model_input(
|
||||
seq_group_metadata_list=self.sample_seq_group,
|
||||
finished_requests_ids=self.sample_finished_ids)
|
||||
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_pooling')
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
|
||||
def test_prepare_model_input_with_virtual_engine(self,
|
||||
mock_prepare_tensors,
|
||||
mock_prepare_pooling):
|
||||
"""Test virtual engine parameter is properly handled"""
|
||||
# Setup mocks
|
||||
mock_model_input = ModelInputForNPUWithPoolingMetadata(
|
||||
seq_lens=[1, 2, 3])
|
||||
mock_prepare_tensors.return_value = mock_model_input
|
||||
|
||||
# Execute with virtual_engine parameter
|
||||
result = self.runner.prepare_model_input(
|
||||
seq_group_metadata_list=self.sample_seq_group,
|
||||
virtual_engine=1,
|
||||
finished_requests_ids=self.sample_finished_ids)
|
||||
|
||||
# Verify virtual_engine doesn't affect the flow
|
||||
self.assertIsNotNone(result)
|
||||
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_pooling')
|
||||
@patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
|
||||
def test_prepare_model_input_with_null_finished_ids(
|
||||
self, mock_prepare_tensors, mock_prepare_pooling):
|
||||
"""Test case when finished_requests_ids is None"""
|
||||
# Setup mocks
|
||||
mock_model_input = ModelInputForNPUWithPoolingMetadata(
|
||||
seq_lens=[1, 2, 3])
|
||||
mock_prepare_tensors.return_value = mock_model_input
|
||||
|
||||
# Execute with None finished_ids
|
||||
result = self.runner.prepare_model_input(
|
||||
seq_group_metadata_list=self.sample_seq_group,
|
||||
finished_requests_ids=None)
|
||||
|
||||
# Verify
|
||||
mock_prepare_tensors.assert_called_once_with(self.sample_seq_group,
|
||||
None)
|
||||
self.assertIsNotNone(result)
|
||||
|
||||
@patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
|
||||
def test_prepare_pooling_normal_case(self, mock_pooling_metadata):
|
||||
"""Test normal case with multiple sequences in group"""
|
||||
# Setup test data
|
||||
mock_pooling_metadata.return_value = None
|
||||
seq_data = {
|
||||
1: MagicMock(spec=SequenceData),
|
||||
2: MagicMock(spec=SequenceData)
|
||||
}
|
||||
pooling_params = MagicMock(spec=PoolingParams)
|
||||
seq_group = MagicMock(spec=SequenceGroupMetadata)
|
||||
seq_group.seq_data = seq_data
|
||||
seq_group.pooling_params = pooling_params
|
||||
|
||||
# Call the function
|
||||
self.runner._prepare_pooling([seq_group], [10, 20])
|
||||
|
||||
# Verify results
|
||||
mock_pooling_metadata.assert_called_once_with(seq_groups=[
|
||||
([1, 2], pooling_params)
|
||||
],
|
||||
seq_data=seq_data,
|
||||
prompt_lens=[10, 20])
|
||||
|
||||
@patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
|
||||
def test_prepare_pooling_empty_group(self, mock_pooling_metadata):
|
||||
"""Test case with empty sequence group"""
|
||||
# Setup empty group
|
||||
mock_pooling_metadata.return_value = None
|
||||
empty_seq_data: dict[int, SequenceData] = {}
|
||||
pooling_params = MagicMock(spec=PoolingParams)
|
||||
empty_group = MagicMock(spec=SequenceGroupMetadata)
|
||||
empty_group.seq_data = empty_seq_data
|
||||
empty_group.pooling_params = pooling_params
|
||||
|
||||
# Call the function
|
||||
self.runner._prepare_pooling([empty_group], [])
|
||||
|
||||
# Verify results
|
||||
mock_pooling_metadata.assert_called_once_with(seq_groups=[
|
||||
([], pooling_params)
|
||||
],
|
||||
seq_data={},
|
||||
prompt_lens=[])
|
||||
|
||||
@patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
|
||||
def test_prepare_pooling_single_sequence(self, mock_pooling_metadata):
|
||||
"""Test case with single sequence in group"""
|
||||
# Setup single sequence
|
||||
mock_pooling_metadata.return_value = None
|
||||
single_seq_data = {3: MagicMock(spec=SequenceData)}
|
||||
pooling_params = MagicMock(spec=PoolingParams)
|
||||
single_group = MagicMock(spec=SequenceGroupMetadata)
|
||||
single_group.seq_data = single_seq_data
|
||||
single_group.pooling_params = pooling_params
|
||||
|
||||
# Call the function
|
||||
self.runner._prepare_pooling([single_group], [5])
|
||||
|
||||
# Verify results
|
||||
mock_pooling_metadata.assert_called_once_with(seq_groups=[
|
||||
([3], pooling_params)
|
||||
],
|
||||
seq_data=single_seq_data,
|
||||
prompt_lens=[5])
|
||||
|
||||
@patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
|
||||
def test_prepare_pooling_multiple_groups(self, mock_pooling_metadata):
|
||||
"""Test case with multiple sequence groups"""
|
||||
# Setup multiple groups
|
||||
mock_pooling_metadata.return_value = None
|
||||
seq_data1 = {1: MagicMock(spec=SequenceData)}
|
||||
seq_data2 = {2: MagicMock(spec=SequenceData)}
|
||||
params1 = MagicMock(spec=PoolingParams)
|
||||
params2 = MagicMock(spec=PoolingParams)
|
||||
|
||||
group1 = MagicMock(spec=SequenceGroupMetadata)
|
||||
group1.seq_data = seq_data1
|
||||
group1.pooling_params = params1
|
||||
|
||||
group2 = MagicMock(spec=SequenceGroupMetadata)
|
||||
group2.seq_data = seq_data2
|
||||
group2.pooling_params = params2
|
||||
|
||||
# Call the function
|
||||
self.runner._prepare_pooling([group1, group2], [10, 20])
|
||||
|
||||
# Verify results
|
||||
mock_pooling_metadata.assert_called_once_with(seq_groups=[
|
||||
([1], params1), ([2], params2)
|
||||
],
|
||||
seq_data={
|
||||
**seq_data1,
|
||||
**seq_data2
|
||||
},
|
||||
prompt_lens=[10, 20])
|
||||
|
||||
@patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
|
||||
def test_prepare_pooling_empty_input(self, mock_pooling_metadata):
|
||||
"""Test case with empty input lists"""
|
||||
# Call the function with empty inputs
|
||||
mock_pooling_metadata.return_value = None
|
||||
self.runner._prepare_pooling([], [])
|
||||
|
||||
# Verify results
|
||||
mock_pooling_metadata.assert_called_once_with(seq_groups=[],
|
||||
seq_data={},
|
||||
prompt_lens=[])
|
||||
|
||||
@patch('vllm.forward_context.set_forward_context')
|
||||
@patch('vllm.distributed.parallel_state._PP',
|
||||
new_callable=lambda: MagicMock(spec=GroupCoordinator,
|
||||
is_last_rank=True))
|
||||
@patch('torch.npu.Event')
|
||||
@patch.object(NPUPoolingModelRunner, 'set_active_loras')
|
||||
@patch.object(NPUPoolingModelRunner, 'set_active_prompt_adapters')
|
||||
def test_execute_model_normal_flow(self, mock_set_adapters, mock_set_loras,
|
||||
mock_event, mock_pp, mock_set_forward):
|
||||
"""Test normal execution path with all dependencies mocked"""
|
||||
|
||||
# Setup model input mock
|
||||
mock_input = MagicMock()
|
||||
mock_input.input_tokens = torch.tensor([1])
|
||||
mock_input.input_positions = torch.tensor([0])
|
||||
mock_input.multi_modal_kwargs = {}
|
||||
self.runner.is_driver_worker = True
|
||||
# Execute
|
||||
self.runner.execute_model(model_input=mock_input,
|
||||
kv_caches=[],
|
||||
num_steps=1)
|
||||
|
||||
# Verify core calls
|
||||
self.runner.model.pooler.assert_called_once()
|
||||
|
||||
@patch('vllm.forward_context.set_forward_context')
|
||||
def test_execute_model_invalid_steps(self, mock_set_forward):
|
||||
"""Test ValueError when num_steps != 1"""
|
||||
with self.assertRaises(ValueError):
|
||||
self.runner.execute_model(model_input=MagicMock(),
|
||||
kv_caches=[],
|
||||
num_steps=2)
|
||||
mock_set_forward.assert_not_called()
|
||||
|
||||
@patch('vllm.forward_context.set_forward_context')
|
||||
@patch('vllm.distributed.parallel_state._PP',
|
||||
new_callable=lambda: MagicMock(spec=GroupCoordinator,
|
||||
is_last_rank=False))
|
||||
@patch('torch.npu.Event')
|
||||
def test_execute_model_perf_monitoring(self, mock_event, mock_pp,
|
||||
mock_set_forward):
|
||||
"""Test performance monitoring with timing mocks"""
|
||||
# Setup mocks
|
||||
|
||||
mock_event.return_value.elapsed_time.return_value = 15.0
|
||||
self.runner.observability_config = MagicMock(
|
||||
collect_model_forward_time=True)
|
||||
|
||||
# Execute
|
||||
self.runner.execute_model(model_input=MagicMock(
|
||||
input_tokens=torch.tensor([1]),
|
||||
input_positions=torch.tensor([0]),
|
||||
multi_modal_kwargs={}),
|
||||
kv_caches=[],
|
||||
num_steps=1)
|
||||
|
||||
# Verify timing calls
|
||||
self.assertEqual(mock_event.call_count, 2)
|
||||
|
||||
@patch('vllm.forward_context.set_forward_context')
|
||||
@patch.object(NPUPoolingModelRunner, 'set_active_loras')
|
||||
@patch('vllm.distributed.parallel_state._PP',
|
||||
new_callable=lambda: MagicMock(spec=GroupCoordinator,
|
||||
is_last_rank=False))
|
||||
def test_execute_model_lora_config(self, mock_pp, set_active_loras,
|
||||
mock_set_forward):
|
||||
"""Test LoRA configuration handling"""
|
||||
# Setup
|
||||
|
||||
self.runner.lora_config = True
|
||||
mock_input = MagicMock()
|
||||
mock_input.lora_requests = ["req1"]
|
||||
mock_input.lora_mapping = {"map": 1}
|
||||
|
||||
# Execute
|
||||
self.runner.execute_model(model_input=mock_input,
|
||||
kv_caches=[],
|
||||
num_steps=1)
|
||||
|
||||
# Verify LoRA call
|
||||
set_active_loras.assert_called_once_with(["req1"], {"map": 1})
|
||||
|
||||
@patch('vllm.forward_context.set_forward_context')
|
||||
@patch('vllm.distributed.parallel_state._PP',
|
||||
new_callable=lambda: MagicMock(spec=GroupCoordinator,
|
||||
is_last_rank=False))
|
||||
def test_execute_model_not_last_rank(self, mock_pp, mock_set_forward):
|
||||
"""Test behavior when not the last pipeline rank"""
|
||||
# Setup
|
||||
|
||||
# Execute
|
||||
self.runner.execute_model(model_input=MagicMock(
|
||||
input_tokens=torch.tensor([1]),
|
||||
input_positions=torch.tensor([0]),
|
||||
multi_modal_kwargs={}),
|
||||
kv_caches=[],
|
||||
num_steps=1)
|
||||
|
||||
# Verify pooler not called
|
||||
self.runner.model.pooler.assert_not_called()
|
236
tests/utils.py
236
tests/utils.py
|
@ -1,236 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/tests/utils.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import functools
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
from typing import Callable, Optional
|
||||
|
||||
import openai
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing_extensions import ParamSpec
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.model_executor.model_loader import get_model_loader
|
||||
from vllm.utils import FlexibleArgumentParser, get_open_port
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
class RemoteOpenAIServer:
|
||||
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
|
||||
|
||||
def __init__(self,
|
||||
model: str,
|
||||
vllm_serve_args: list[str],
|
||||
*,
|
||||
env_dict: Optional[dict[str, str]] = None,
|
||||
seed: Optional[int] = 0,
|
||||
auto_port: bool = True,
|
||||
max_wait_seconds: Optional[float] = None) -> None:
|
||||
if auto_port:
|
||||
if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
|
||||
raise ValueError("You have manually specified the port "
|
||||
"when `auto_port=True`.")
|
||||
|
||||
# Don't mutate the input args
|
||||
vllm_serve_args = vllm_serve_args + [
|
||||
"--port", str(get_open_port())
|
||||
]
|
||||
if seed is not None:
|
||||
if "--seed" in vllm_serve_args:
|
||||
raise ValueError("You have manually specified the seed "
|
||||
f"when `seed={seed}`.")
|
||||
|
||||
vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="vLLM's remote OpenAI server.")
|
||||
parser = make_arg_parser(parser)
|
||||
args = parser.parse_args(["--model", model, *vllm_serve_args])
|
||||
self.host = str(args.host or 'localhost')
|
||||
self.port = int(args.port)
|
||||
|
||||
self.show_hidden_metrics = \
|
||||
args.show_hidden_metrics_for_version is not None
|
||||
|
||||
# download the model before starting the server to avoid timeout
|
||||
is_local = os.path.isdir(model)
|
||||
if not is_local:
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
model_config = engine_args.create_model_config()
|
||||
load_config = engine_args.create_load_config()
|
||||
|
||||
model_loader = get_model_loader(load_config)
|
||||
model_loader.download_model(model_config)
|
||||
|
||||
env = os.environ.copy()
|
||||
# the current process might initialize cuda,
|
||||
# to be safe, we should use spawn method
|
||||
env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
|
||||
if env_dict is not None:
|
||||
env.update(env_dict)
|
||||
self.proc = subprocess.Popen(
|
||||
["vllm", "serve", model, *vllm_serve_args],
|
||||
env=env,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stderr,
|
||||
)
|
||||
max_wait_seconds = max_wait_seconds or 240
|
||||
self._wait_for_server(url=self.url_for("health"),
|
||||
timeout=max_wait_seconds)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.proc.terminate()
|
||||
try:
|
||||
self.proc.wait(8)
|
||||
except subprocess.TimeoutExpired:
|
||||
# force kill if needed
|
||||
self.proc.kill()
|
||||
|
||||
def _wait_for_server(self, *, url: str, timeout: float):
|
||||
# run health check
|
||||
start = time.time()
|
||||
while True:
|
||||
try:
|
||||
if requests.get(url).status_code == 200:
|
||||
break
|
||||
except Exception:
|
||||
# this exception can only be raised by requests.get,
|
||||
# which means the server is not ready yet.
|
||||
# the stack trace is not useful, so we suppress it
|
||||
# by using `raise from None`.
|
||||
result = self.proc.poll()
|
||||
if result is not None and result != 0:
|
||||
raise RuntimeError("Server exited unexpectedly.") from None
|
||||
|
||||
time.sleep(0.5)
|
||||
if time.time() - start > timeout:
|
||||
raise RuntimeError(
|
||||
"Server failed to start in time.") from None
|
||||
|
||||
@property
|
||||
def url_root(self) -> str:
|
||||
return f"http://{self.host}:{self.port}"
|
||||
|
||||
def url_for(self, *parts: str) -> str:
|
||||
return self.url_root + "/" + "/".join(parts)
|
||||
|
||||
def get_client(self, **kwargs):
|
||||
if "timeout" not in kwargs:
|
||||
kwargs["timeout"] = 600
|
||||
return openai.OpenAI(
|
||||
base_url=self.url_for("v1"),
|
||||
api_key=self.DUMMY_API_KEY,
|
||||
max_retries=0,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_async_client(self, **kwargs):
|
||||
if "timeout" not in kwargs:
|
||||
kwargs["timeout"] = 600
|
||||
return openai.AsyncOpenAI(base_url=self.url_for("v1"),
|
||||
api_key=self.DUMMY_API_KEY,
|
||||
max_retries=0,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def fork_new_process_for_each_test(
|
||||
f: Callable[_P, None]) -> Callable[_P, None]:
|
||||
"""Decorator to fork a new process for each test function.
|
||||
See https://github.com/vllm-project/vllm/issues/7053 for more details.
|
||||
"""
|
||||
|
||||
@functools.wraps(f)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
|
||||
# Make the process the leader of its own process group
|
||||
# to avoid sending SIGTERM to the parent process
|
||||
os.setpgrp()
|
||||
from _pytest.outcomes import Skipped
|
||||
pid = os.fork()
|
||||
print(f"Fork a new process to run a test {pid}")
|
||||
if pid == 0:
|
||||
try:
|
||||
f(*args, **kwargs)
|
||||
except Skipped as e:
|
||||
# convert Skipped to exit code 0
|
||||
print(str(e))
|
||||
os._exit(0)
|
||||
except Exception:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
os._exit(1)
|
||||
else:
|
||||
os._exit(0)
|
||||
else:
|
||||
pgid = os.getpgid(pid)
|
||||
_pid, _exitcode = os.waitpid(pid, 0)
|
||||
# ignore SIGTERM signal itself
|
||||
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
||||
# kill all child processes
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
# restore the signal handler
|
||||
signal.signal(signal.SIGTERM, old_signal_handler)
|
||||
assert _exitcode == 0, (f"function {f} failed when called with"
|
||||
f" args {args} and kwargs {kwargs}")
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
|
||||
tensor = torch.tensor(tensor)
|
||||
tensor = tensor[..., :dimensions]
|
||||
tensor = F.normalize(tensor, p=2, dim=1)
|
||||
return tensor
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
*,
|
||||
embeddings_0_lst: Sequence[list[float]],
|
||||
embeddings_1_lst: Sequence[list[float]],
|
||||
name_0: str,
|
||||
name_1: str,
|
||||
tol: float = 1e-3,
|
||||
) -> None:
|
||||
assert len(embeddings_0_lst) == len(embeddings_1_lst)
|
||||
|
||||
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
|
||||
zip(embeddings_0_lst, embeddings_1_lst)):
|
||||
assert len(embeddings_0) == len(embeddings_1), (
|
||||
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
|
||||
|
||||
sim = F.cosine_similarity(torch.tensor(embeddings_0),
|
||||
torch.tensor(embeddings_1),
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\nCosine similarity: \t{sim:.4f}"
|
||||
f"\n{name_0}:\t{embeddings_0[:16]!r}"
|
||||
f"\n{name_1}:\t{embeddings_1[:16]!r}")
|
||||
|
||||
assert sim >= 1 - tol, fail_msg
|
|
@ -15,7 +15,6 @@
|
|||
# limitations under the License.
|
||||
from typing import Optional
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import logger
|
||||
|
||||
TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
|
||||
|
@ -126,46 +125,36 @@ def get_ascend_config():
|
|||
def check_ascend_config(vllm_config, enforce_eager):
|
||||
ascend_config = get_ascend_config()
|
||||
|
||||
# for v0 engine
|
||||
if not envs.VLLM_USE_V1:
|
||||
# for eager mode
|
||||
if enforce_eager:
|
||||
# torchair_graph cannot be enabled with eager mode.
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
raise NotImplementedError(
|
||||
"Torchair graph mode is only supported for V1 Engine.")
|
||||
if ascend_config.ascend_scheduler_config.enabled:
|
||||
raise NotImplementedError(
|
||||
"Ascend scheduler is only supported for V1 Engine.")
|
||||
# for v1 engine
|
||||
raise RuntimeError(
|
||||
"Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
|
||||
)
|
||||
# for graph mode
|
||||
else:
|
||||
# for eager mode
|
||||
if enforce_eager:
|
||||
# torchair_graph cannot be enabled with eager mode.
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
raise RuntimeError(
|
||||
"Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
|
||||
)
|
||||
# for graph mode
|
||||
# torchair_graph case
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
# torchair_graph is supported for deepseek/pangu model only.
|
||||
if vllm_config.model_config:
|
||||
model_type = vllm_config.model_config.hf_config.model_type
|
||||
if not _check_torchair_supported(model_type):
|
||||
raise NotImplementedError(
|
||||
"Torchair graph mode only works with following model types:"
|
||||
f"{TORCHAIR_MODEL_LIST}.")
|
||||
# aclgraph case
|
||||
else:
|
||||
# torchair_graph case
|
||||
if ascend_config.torchair_graph_config.enabled:
|
||||
# torchair_graph is supported for deepseek/pangu model only.
|
||||
if vllm_config.model_config:
|
||||
model_type = vllm_config.model_config.hf_config.model_type
|
||||
if not _check_torchair_supported(model_type):
|
||||
raise NotImplementedError(
|
||||
"Torchair graph mode only works with following model types:"
|
||||
f"{TORCHAIR_MODEL_LIST}.")
|
||||
# aclgraph case
|
||||
else:
|
||||
# aclgraph doesn't work with deepseek model and only qwen model is well tested.
|
||||
if vllm_config.model_config:
|
||||
model_type = vllm_config.model_config.hf_config.model_type
|
||||
if "deepseek" in model_type:
|
||||
raise NotImplementedError(
|
||||
"ACL Graph does not support deepseek. Please "
|
||||
"try torchair graph mode to serve deepseek models on vllm-ascend."
|
||||
" Or set `enforce_eager=True` to use eager mode.")
|
||||
if "qwen" not in model_type:
|
||||
logger.warning(
|
||||
"ACL Graph is currently experimental. Please "
|
||||
"raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
|
||||
" if you encourage any Error")
|
||||
# aclgraph doesn't work with deepseek model and only qwen model is well tested.
|
||||
if vllm_config.model_config:
|
||||
model_type = vllm_config.model_config.hf_config.model_type
|
||||
if "deepseek" in model_type:
|
||||
raise NotImplementedError(
|
||||
"ACL Graph does not support deepseek. Please "
|
||||
"try torchair graph mode to serve deepseek models on vllm-ascend."
|
||||
" Or set `enforce_eager=True` to use eager mode.")
|
||||
if "qwen" not in model_type:
|
||||
logger.warning(
|
||||
"ACL Graph is currently experimental. Please "
|
||||
"raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
|
||||
" if you encourage any Error")
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -15,7 +15,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
|
|||
from vllm.utils import cdiv, round_down
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
|
||||
from vllm_ascend.multistream.context import get_multistream_comm_context
|
||||
|
@ -27,6 +26,8 @@ from vllm_ascend.worker.npu_input_batch import InputBatch
|
|||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommonAttentionMetadata:
|
||||
|
|
|
@ -51,8 +51,8 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
|||
"C_COMPILER":
|
||||
lambda: os.getenv("C_COMPILER", None),
|
||||
# The version of the Ascend chip. If not set, the default value is
|
||||
# ASCEND910B1. It's used for package building. Please make sure that the
|
||||
# version is correct.
|
||||
# ASCEND910B1(Available for A2 and A3 series). It's used for package building.
|
||||
# Please make sure that the version is correct.
|
||||
"SOC_VERSION":
|
||||
lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
|
||||
# If set, vllm-ascend will print verbose logs during compilation
|
||||
|
|
|
@ -7,7 +7,7 @@ def register_model():
|
|||
from .deepseek_dbo import CustomDeepseekDBOForCausalLM # noqa: F401
|
||||
from .deepseek_mtp import CustomDeepSeekMTP # noqa: F401
|
||||
from .deepseek_v2 import CustomDeepseekV2ForCausalLM # noqa: F401
|
||||
from .deepseek_v2 import CustomDeepseekV3ForCausalLM # noqa: F401
|
||||
from .deepseek_v3 import CustomDeepseekV3ForCausalLM # noqa: F401
|
||||
from .qwen2_5_vl import \
|
||||
AscendQwen2_5_VLForConditionalGeneration # noqa: F401
|
||||
from .qwen2_vl import AscendQwen2VLForConditionalGeneration # noqa: F401
|
||||
|
@ -47,7 +47,7 @@ def register_model():
|
|||
|
||||
ModelRegistry.register_model(
|
||||
"DeepseekV3ForCausalLM",
|
||||
"vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
|
||||
"vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
|
||||
|
||||
ModelRegistry.register_model(
|
||||
"Qwen3MoeForCausalLM",
|
||||
|
|
|
@ -30,7 +30,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
|
|||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch_npu # noqa: F401
|
||||
import vllm.envs as envs
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
|
@ -397,20 +396,17 @@ class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention):
|
|||
hidden_states_or_q_c = hidden_states
|
||||
if self.torchair_graph_enabled:
|
||||
forward_kwargs = {}
|
||||
if envs.VLLM_USE_V1:
|
||||
output_shape = hidden_states.shape
|
||||
output = torch.empty(output_shape,
|
||||
dtype=hidden_states_or_q_c.dtype,
|
||||
device=hidden_states_or_q_c.device)
|
||||
forward_kwargs['output'] = output
|
||||
|
||||
output_shape = hidden_states.shape
|
||||
output = torch.empty(output_shape,
|
||||
dtype=hidden_states_or_q_c.dtype,
|
||||
device=hidden_states_or_q_c.device)
|
||||
forward_kwargs['output'] = output
|
||||
output = self.mla_attn.impl.forward(self.mla_attn,
|
||||
hidden_states_or_q_c,
|
||||
hidden_states, None, kv_cache,
|
||||
attn_metadata,
|
||||
**forward_kwargs)
|
||||
if envs.VLLM_USE_V1:
|
||||
output = output.view(-1, output_shape[-1])
|
||||
output = output.view(-1, output_shape[-1])
|
||||
return output
|
||||
else:
|
||||
kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
|
||||
|
@ -885,7 +881,7 @@ class CustomDeepseekDBOModel(nn.Module):
|
|||
def can_run_ms(self):
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
# support mla attention and V1 engine at present
|
||||
if not self.use_mla or not envs.VLLM_USE_V1:
|
||||
if not self.use_mla:
|
||||
return False
|
||||
# enable prefill overlap
|
||||
if attn_metadata is None or attn_metadata.num_prefills == 0:
|
||||
|
|
|
@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
|
|||
|
||||
import torch
|
||||
import torch_npu
|
||||
import vllm.envs as envs
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
|
@ -579,20 +578,17 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
|
|||
else:
|
||||
hidden_states_or_q_c = hidden_states
|
||||
if self.torchair_graph_enabled:
|
||||
if envs.VLLM_USE_V1:
|
||||
output_shape = hidden_states.shape
|
||||
output = torch.empty(output_shape,
|
||||
dtype=hidden_states_or_q_c.dtype,
|
||||
device=hidden_states_or_q_c.device)
|
||||
forward_kwargs['output'] = output
|
||||
|
||||
output_shape = hidden_states.shape
|
||||
output = torch.empty(output_shape,
|
||||
dtype=hidden_states_or_q_c.dtype,
|
||||
device=hidden_states_or_q_c.device)
|
||||
forward_kwargs['output'] = output
|
||||
output = self.mla_attn.impl.forward(self.mla_attn,
|
||||
hidden_states_or_q_c,
|
||||
hidden_states, None, kv_cache,
|
||||
attn_metadata,
|
||||
**forward_kwargs)
|
||||
if envs.VLLM_USE_V1:
|
||||
output = output.view(-1, output_shape[-1])
|
||||
output = output.view(-1, output_shape[-1])
|
||||
return output
|
||||
else:
|
||||
kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
|
||||
|
@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
|
|||
prefix=f"{prefix}.mlp",
|
||||
)
|
||||
self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
|
||||
and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
|
||||
and model_config.use_mla and self.tp_size > 1
|
||||
else:
|
||||
self.mlp = CustomDeepseekV2MLP(
|
||||
hidden_size=config.hidden_size,
|
||||
|
@ -983,7 +979,3 @@ class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
|
|||
attn_metadata, intermediate_tensors,
|
||||
inputs_embeds)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
|
||||
pass
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -13,20 +19,9 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/vllm/v1/pool/metadata.py
|
||||
#
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM
|
||||
|
||||
|
||||
@dataclass
|
||||
class PoolingMetadata:
|
||||
"""Tensors for pooling."""
|
||||
|
||||
prompt_lens: torch.Tensor
|
||||
prompt_token_ids: Optional[torch.Tensor]
|
||||
pooling_params: list[PoolingParams]
|
||||
class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
|
||||
pass
|
|
@ -18,25 +18,25 @@
|
|||
import torch
|
||||
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
|
||||
|
||||
from vllm_ascend.utils import is_310p
|
||||
|
||||
class AscendQuickGELU(QuickGELU):
|
||||
|
||||
def forward_oot(self, x: torch.tensor) -> torch.Tensor:
|
||||
import torch_npu
|
||||
|
||||
out = torch_npu.npu_fast_gelu(x)
|
||||
return out
|
||||
|
||||
|
||||
def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor:
|
||||
import torch_npu
|
||||
class AscendSiluAndMul(SiluAndMul):
|
||||
|
||||
if is_310p():
|
||||
out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
|
||||
else:
|
||||
out = torch_npu.npu_swiglu(x)
|
||||
return out
|
||||
def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
|
||||
import torch_npu
|
||||
|
||||
from vllm_ascend.utils import is_310p
|
||||
|
||||
def quick_gelu_forward_oot(self, x: torch.tensor) -> torch.Tensor:
|
||||
import torch_npu
|
||||
|
||||
out = torch_npu.npu_fast_gelu(x)
|
||||
return out
|
||||
|
||||
|
||||
QuickGELU.forward_oot = quick_gelu_forward_oot
|
||||
SiluAndMul.forward_oot = silu_and_mul_forward_oot
|
||||
if is_310p():
|
||||
out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
|
||||
else:
|
||||
out = torch_npu.npu_swiglu(x)
|
||||
return out
|
||||
|
|
|
@ -40,23 +40,26 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs):
|
|||
|
||||
|
||||
def forward_oot(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
router_logits: torch.Tensor,
|
||||
renormalize: bool,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||
global_num_experts: Optional[int] = None,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
apply_router_weight_on_input: bool = False,
|
||||
activation: str = "silu",
|
||||
) -> torch.Tensor:
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
use_grouped_topk: bool,
|
||||
top_k: int,
|
||||
router_logits: torch.Tensor,
|
||||
renormalize: bool,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||
global_num_experts: Optional[int] = None,
|
||||
expert_map: Optional[torch.Tensor] = None,
|
||||
apply_router_weight_on_input: bool = False,
|
||||
activation: str = "silu",
|
||||
enable_eplb: bool = False,
|
||||
expert_load_view: Optional[torch.Tensor] = None,
|
||||
logical_to_physical_map: Optional[torch.Tensor] = None,
|
||||
logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
|
||||
if SELECT_GATING_TOPK_SOTFMAX_EXPERTS:
|
||||
topk_weights, topk_ids = select_gating_top_k_softmax_experts(
|
||||
|
|
|
@ -73,38 +73,6 @@
|
|||
# Future Plan:
|
||||
# Keep this patch in vllm-ascend.
|
||||
#
|
||||
# ** File: worker/patch_common/patch_multi_step_worker.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.spec_decode.multi_step_worker.MultiStepWorker.sampler_output`
|
||||
# Why:
|
||||
# There are cuda hard code (current_platform.is_cuda_alike()) in
|
||||
# `MultiStepWorker.sampler_output`, and we need to use the patched `TP1DraftModelRunner` in it.
|
||||
# How:
|
||||
# Make speculative decoding extensible to different backends.
|
||||
# - support attention metadata register to the set supported spec decode
|
||||
# - offer a api in platform to determine whether spec decode is supported,
|
||||
# and deprecate is_cuda_alike in it.
|
||||
# Related PR (if no, explain why):
|
||||
# - https://github.com/vllm-project/vllm/pull/15195
|
||||
# - https://github.com/vllm-project/vllm-ascend/pull/395
|
||||
# Future Plan:
|
||||
# Revert it when the related pr is merged in vllm and vllm-ascend.
|
||||
#
|
||||
# ** File: worker/patch_common/patch_spec_decode_worker.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker`
|
||||
# Why:
|
||||
# We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`.
|
||||
# The mainly reason to overwrite `TP1DraftModelRunner`is the hard code of
|
||||
# `FlashAttentionMetadata`
|
||||
# How:
|
||||
# ditto
|
||||
# Related PR (if no, explain why):
|
||||
# - https://github.com/vllm-project/vllm/pull/15195
|
||||
# - https://github.com/vllm-project/vllm-ascend/pull/395
|
||||
# Future Plan:
|
||||
# Revert it when the related pr is merged in vllm and vllm-ascend.
|
||||
#
|
||||
# ** File: worker/patch_common/patch_distributed.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.distributed.parallel_state.GroupCoordinator`
|
||||
|
|
|
@ -20,6 +20,4 @@
|
|||
import vllm_ascend.patch.worker.patch_common.patch_utils # noqa isort:skip
|
||||
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_sampler # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_spec_decode_worker # noqa
|
||||
|
|
|
@ -1,91 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List, Set, Tuple
|
||||
|
||||
import torch
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||
|
||||
from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
|
||||
|
||||
|
||||
def sampler_output(
|
||||
self,
|
||||
execute_model_req: ExecuteModelRequest,
|
||||
sample_len: int,
|
||||
seq_ids_with_bonus_token_in_last_step: Set[int],
|
||||
) -> Tuple[List[SamplerOutput], bool]:
|
||||
"""Run the model forward pass sample_len times. Returns the list of
|
||||
sampler output, one per model forward pass, along with indicator of
|
||||
whether torch tensor in sampler output need to be transposed in latter
|
||||
sampler_output_to_torch logic.
|
||||
|
||||
For multi step worker, this indicator shall be True.
|
||||
"""
|
||||
self._raise_if_unsupported(execute_model_req)
|
||||
# Expand the batch for sequences with a bonus token.
|
||||
# Perform a forward pass on the expanded batch and filter the
|
||||
# response to retain only the original sequences' responses.
|
||||
expanded_request, indices_of_seq_with_bonus_tokens =\
|
||||
self._expand_execute_model_request(
|
||||
execute_model_req, seq_ids_with_bonus_token_in_last_step)
|
||||
|
||||
# Run model sample_len times.
|
||||
model_outputs: List[SamplerOutput] = []
|
||||
|
||||
# TODO: supports_gpu_multi_step is False in ASCEND
|
||||
if isinstance(self.model_runner, TP1DraftModelRunner) and \
|
||||
self.model_runner.supports_gpu_multi_step(expanded_request):
|
||||
# Here we run the draft_model_runner with multi-step prepare
|
||||
# on the GPU directly
|
||||
expanded_request.num_steps = sample_len
|
||||
self.model_runner.set_indices_of_seq_with_bonus_tokens(
|
||||
indices_of_seq_with_bonus_tokens)
|
||||
model_outputs = self.execute_model(execute_model_req=expanded_request)
|
||||
else:
|
||||
# Here we run multi-step directly, with every step prepared
|
||||
# on the CPU.
|
||||
# TODO Remove this branch once DraftModelRunner supports TP>1
|
||||
# and other restrictions that are part of DraftModelRunner's
|
||||
# supports_gpu_multi_step(..)
|
||||
if expanded_request.previous_hidden_states is not None:
|
||||
self.worker.model_runner.return_hidden_states = True
|
||||
for _ in range(sample_len):
|
||||
model_output: List[SamplerOutput] = self.worker.execute_model(
|
||||
execute_model_req=expanded_request)
|
||||
assert (len(model_output) == 1
|
||||
), "composing multistep workers not supported"
|
||||
model_output = model_output[0]
|
||||
self._maybe_update_previous_hidden_states(model_output,
|
||||
expanded_request)
|
||||
|
||||
self._append_new_tokens(model_output,
|
||||
expanded_request.seq_group_metadata_list,
|
||||
indices_of_seq_with_bonus_tokens)
|
||||
model_outputs.append(model_output)
|
||||
|
||||
# move indices to device to avoid stream sync
|
||||
indices_of_seq_with_bonus_tokens = torch.tensor(
|
||||
indices_of_seq_with_bonus_tokens, device=self.device)
|
||||
filtered_model_outputs = self._filter_model_output(
|
||||
model_outputs, indices_of_seq_with_bonus_tokens)
|
||||
return filtered_model_outputs, True
|
||||
|
||||
|
||||
MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output)
|
|
@ -1,157 +0,0 @@
|
|||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.model_executor.layers.spec_decode_base_sampler import \
|
||||
SpecDecodeBaseSampler
|
||||
from vllm.model_executor.layers.typical_acceptance_sampler import \
|
||||
TypicalAcceptanceSampler
|
||||
from vllm.spec_decode.medusa_worker import MedusaWorker
|
||||
from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker
|
||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||
from vllm.spec_decode.ngram_worker import NGramWorker
|
||||
from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker
|
||||
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
|
||||
from vllm.worker.worker_base import WorkerBase
|
||||
|
||||
from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
|
||||
|
||||
|
||||
def create_worker(
|
||||
cls,
|
||||
scorer_worker: WorkerBase,
|
||||
draft_worker_kwargs: Dict[str, Any],
|
||||
disable_mqa_scorer: bool,
|
||||
disable_by_batch_size: Optional[int],
|
||||
draft_token_acceptance_method: str,
|
||||
typical_acceptance_sampler_posterior_threshold: float,
|
||||
typical_acceptance_sampler_posterior_alpha: float,
|
||||
disable_logprobs: bool,
|
||||
disable_log_stats: bool,
|
||||
num_speculative_tokens: int,
|
||||
) -> "SpecDecodeWorker":
|
||||
|
||||
allow_zero_draft_token_step = True
|
||||
enable_lm_head_weight_load = False
|
||||
num_spec_prefill_steps = 1
|
||||
ngram_prompt_lookup_max = (
|
||||
draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
|
||||
ngram_prompt_lookup_min = (
|
||||
draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
|
||||
|
||||
draft_model_config = draft_worker_kwargs["vllm_config"].model_config
|
||||
draft_parallel_config: ParallelConfig = draft_worker_kwargs[
|
||||
'vllm_config'].parallel_config
|
||||
if ngram_prompt_lookup_max > 0:
|
||||
draft_worker_kwargs[
|
||||
"device_type"] = scorer_worker.device_config.device.type
|
||||
proposer_worker = NGramWorker(**draft_worker_kwargs)
|
||||
proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
|
||||
ngram_prompt_lookup_max)
|
||||
else:
|
||||
# TODO(Yizhou): A quick fix, must be refactored ASAP
|
||||
# ngram need not this fix.
|
||||
draft_worker_kwargs[
|
||||
"vllm_config"].parallel_config.expert_parallel_size = 1
|
||||
draft_worker_kwargs[
|
||||
"vllm_config"].parallel_config.expert_tensor_parallel_size = 1
|
||||
|
||||
draft_tp = draft_parallel_config.tensor_parallel_size
|
||||
target_tp = scorer_worker.parallel_config.tensor_parallel_size
|
||||
|
||||
if draft_model_config.hf_config.model_type == "mlp_speculator":
|
||||
proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
|
||||
elif draft_model_config.hf_config.model_type == "medusa":
|
||||
proposer_worker = MedusaWorker(**draft_worker_kwargs)
|
||||
else:
|
||||
# Note: The current version of the MTP module doer not support
|
||||
# the use of TP1DraftModelRunner
|
||||
if draft_tp == 1 and draft_model_config.hf_config.model_type !=\
|
||||
"deepseek_mtp":
|
||||
draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner
|
||||
else:
|
||||
if draft_model_config.hf_config.model_type == "eagle":
|
||||
raise NotImplementedError(
|
||||
f"{draft_model_config.hf_config.model_type} "
|
||||
"does not support TP > 1 yet")
|
||||
|
||||
allow_zero_draft_token_step = False
|
||||
|
||||
# Load lm_head weight for eagle in init_device
|
||||
if draft_model_config.hf_config.model_type == "eagle":
|
||||
enable_lm_head_weight_load = True
|
||||
|
||||
proposer_worker = MultiStepWorker(**draft_worker_kwargs)
|
||||
if draft_model_config.hf_config.model_type == "deepseek_mtp":
|
||||
num_spec_prefill_steps = draft_model_config.hf_config.n_predict
|
||||
|
||||
proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker(
|
||||
proposer_worker, draft_tp, target_tp)
|
||||
|
||||
logger.info("Configuring SpecDecodeWorker with proposer=%s",
|
||||
type(proposer_worker))
|
||||
|
||||
spec_decode_sampler: SpecDecodeBaseSampler = None
|
||||
if draft_token_acceptance_method == "rejection_sampler":
|
||||
spec_decode_sampler = RejectionSampler()
|
||||
elif draft_token_acceptance_method == "typical_acceptance_sampler":
|
||||
spec_decode_sampler = TypicalAcceptanceSampler(
|
||||
posterior_threshold=\
|
||||
typical_acceptance_sampler_posterior_threshold,
|
||||
posterior_alpha=typical_acceptance_sampler_posterior_alpha,
|
||||
)
|
||||
logger.info(
|
||||
"[Speculative Decoding] Configuring"
|
||||
" SpecDecodeWorker with sampler=%s", type(spec_decode_sampler))
|
||||
|
||||
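# Illustrative aside: the acceptance-method string acts as a tiny factory.
# The sampler classes below are placeholders for vLLM's RejectionSampler and
# TypicalAcceptanceSampler, kept only to show the shape of the selection.
class _RejectionSamplerStub:
    pass


class _TypicalAcceptanceSamplerStub:

    def __init__(self, posterior_threshold: float, posterior_alpha: float):
        self.posterior_threshold = posterior_threshold
        self.posterior_alpha = posterior_alpha


def _make_sampler(method: str, threshold: float, alpha: float):
    if method == "rejection_sampler":
        return _RejectionSamplerStub()
    if method == "typical_acceptance_sampler":
        return _TypicalAcceptanceSamplerStub(posterior_threshold=threshold,
                                             posterior_alpha=alpha)
    raise ValueError(f"Unknown draft_token_acceptance_method: {method}")
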
    if not disable_mqa_scorer:
        if scorer_worker.model_runner.attn_backend.get_name() != "FLASH_ATTN":
            disable_mqa_scorer = True
            logger.info("[Speculative Decoding] Disabling MQA scorer as the "
                        "MQA is only available with flash attn backend.")

        if draft_model_config and \
            draft_model_config.max_model_len < \
                scorer_worker.model_config.max_model_len:
            disable_mqa_scorer = True
            logger.info("[Speculative Decoding] Disabling MQA scorer as the "
                        "draft model max_model_len is smaller than the target "
                        "model max_model_len.")

        if not scorer_worker.model_runner.model_config.enforce_eager:
            disable_mqa_scorer = True
            logger.info("[Speculative Decoding] Disabling MQA scorer as the "
                        "target model is not running in eager mode.")

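# Illustrative aside: the three guards above collapse into one predicate.
# Parameter names are illustrative; only the conditions mirror the code.
def _should_disable_mqa_scorer(attn_backend_name: str,
                               draft_max_model_len: int,
                               target_max_model_len: int,
                               target_enforce_eager: bool) -> bool:
    # MQA scoring stays enabled only with the flash-attention backend, a
    # draft model that covers the target's max_model_len, and an eager-mode
    # target model.
    if attn_backend_name != "FLASH_ATTN":
        return True
    if draft_max_model_len < target_max_model_len:
        return True
    if not target_enforce_eager:
        return True
    return False
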
    return SpecDecodeWorker(
        proposer_worker,
        scorer_worker,
        disable_mqa_scorer=disable_mqa_scorer,
        disable_logprobs=disable_logprobs,
        disable_log_stats=disable_log_stats,
        disable_by_batch_size=disable_by_batch_size,
        spec_decode_sampler=spec_decode_sampler,
        allow_zero_draft_token_step=allow_zero_draft_token_step,
        enable_lm_head_weight_load=enable_lm_head_weight_load,
        num_spec_prefill_steps=num_spec_prefill_steps)


SpecDecodeWorker.create_worker = classmethod(create_worker)
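The final assignment re-binds this module-level function as a classmethod on vLLM's SpecDecodeWorker, the import-time monkey-patching pattern the patch modules rely on. A generic sketch of that pattern, using a toy class rather than the real worker:

class _Upstream:

    @classmethod
    def create(cls, name: str) -> "_Upstream":
        raise NotImplementedError


def _patched_create(cls, name: str) -> "_Upstream":
    obj = cls.__new__(cls)
    obj.name = name  # plugin-specific construction goes here
    return obj


# Same re-binding trick as `SpecDecodeWorker.create_worker = classmethod(create_worker)`.
_Upstream.create = classmethod(_patched_create)
assert _Upstream.create("npu").name == "npu"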
@ -16,7 +16,6 @@
#

import gc
import os
from datetime import timedelta
from typing import TYPE_CHECKING, Optional, Tuple

@ -30,7 +29,7 @@ from vllm.platforms import Platform, PlatformEnum
from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                       init_ascend_config)
from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD, is_310p,
                               update_aclgraph_sizes)
                               register_ascend_customop, update_aclgraph_sizes)

if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig
@ -117,6 +116,8 @@ class NPUPlatform(Platform):

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        if not envs.VLLM_USE_V1:
            raise ValueError("vLLM Ascend does not support V0 engine.")
        # initialize ascend config from vllm additional_config
        ascend_config = init_ascend_config(vllm_config)

@ -180,18 +181,7 @@ class NPUPlatform(Platform):
            update_aclgraph_sizes(vllm_config)

        if parallel_config and parallel_config.worker_cls == "auto":
            if envs.VLLM_USE_V1:
                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
            elif vllm_config.speculative_config:
                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
                # fault when using spec decode with V0 engine.
                os.environ["ACL_OP_INIT_MODE"] = "1"
                parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
            elif vllm_config.scheduler_config.is_multi_step:
                parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
            else:
                parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
            parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"

        if cache_config:
            if cache_config.block_size is None:
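# Illustrative aside: worker_cls is kept as a dotted import path and resolved
# dynamically by the engine later on. A generic resolver sketch (not vLLM's
# internal helper, whose name and location may differ):
import importlib


def _resolve_worker_cls(qualname: str):
    """Resolve e.g. 'vllm_ascend.worker.worker_v1.NPUWorker' to a class."""
    module_name, _, attr = qualname.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)


# Uses a stdlib path so the sketch runs anywhere.
assert _resolve_worker_cls("collections.OrderedDict").__name__ == "OrderedDict"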
@ -202,34 +192,35 @@ class NPUPlatform(Platform):
                )
                cache_config.block_size = 128

        if envs.VLLM_USE_V1:
            # Activate custom ops for v1, except on 310P
            if not is_310p():
                compilation_config.custom_ops = ["all"]
        # Activate custom ops for v1, except on 310P
        if not is_310p():
            compilation_config.custom_ops = ["all"]

            # If ascend_scheduler_config is enabled,
            # extends the original scheduler_config to use AscendScheduler.
            if ascend_config.ascend_scheduler_config.enabled:
                from vllm_ascend.core.schedule_config import \
                    AscendSchedulerConfig
                ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
                    vllm_config.scheduler_config,
                    ascend_config.ascend_scheduler_config)
                vllm_config.scheduler_config = ascend_scheduler_config
        # If ascend_scheduler_config is enabled,
        # extends the original scheduler_config to use AscendScheduler.
        if ascend_config.ascend_scheduler_config.enabled:
            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
            ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
                vllm_config.scheduler_config,
                ascend_config.ascend_scheduler_config)
            vllm_config.scheduler_config = ascend_scheduler_config

        # register Ascend CustomOp
        register_ascend_customop()

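# Illustrative aside: since check_and_update_config reads the Ascend options
# from vLLM's additional_config, enabling the Ascend scheduler from user code
# presumably looks like the sketch below. The model name and the exact option
# plumbing are assumptions, not taken from this diff.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    additional_config={
        # Consumed by init_ascend_config(); switches scheduler_config over to
        # AscendSchedulerConfig as in the hunk above.
        "ascend_scheduler_config": {"enabled": True},
    },
)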
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla):
        if use_v1 and use_mla:
            return "vllm_ascend.attention.mla_v1.AscendMLABackend"
        if not use_v1:
            raise ValueError("vLLM Ascend does not support V0 engine.")

        use_torchair = get_ascend_config().torchair_graph_config.enabled
        if use_v1 and use_torchair:
            return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
        if use_v1:
            return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
        if use_mla:
            return "vllm_ascend.attention.attention.AscendMLAAttentionBackend"
        return "vllm_ascend.attention.attention.AscendAttentionBackend"
            return "vllm_ascend.attention.mla_v1.AscendMLABackend"
        elif use_torchair:
            return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
        else:
            return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"

    @classmethod
    def get_punica_wrapper(cls) -> str:
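# Illustrative aside: after this change the backend choice is V1-only: MLA
# first, then the torchair graph backend, otherwise the default V1 attention
# backend. A standalone restatement of that decision table (the returned
# strings are the class paths used above; the function itself is scaffolding):
def _pick_attn_backend(use_v1: bool, use_mla: bool, use_torchair: bool) -> str:
    if not use_v1:
        raise ValueError("vLLM Ascend does not support V0 engine.")
    if use_mla:
        return "vllm_ascend.attention.mla_v1.AscendMLABackend"
    if use_torchair:
        return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
    return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"


assert _pick_attn_backend(True, False, False).endswith("AscendAttentionBackend")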
Some files were not shown because too many files have changed in this diff.