[Frontend] add run batch to CLI (#18804)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Reid 2025-05-28 22:08:57 +08:00 committed by GitHub
parent 4c2b38ce9e
commit 435fa95444
5 changed files with 110 additions and 22 deletions

View File

@@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h
 For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
+
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
 
 ### Step 4: View your results
 
 Your results are now on S3. You can view them in your terminal by running
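
For reference, a minimal sketch for inspecting the `results.jsonl` written by either invocation above. The `custom_id` / `response` / `body` nesting follows the OpenAI-style batch output that vLLM's `BatchRequestOutput` serializes; treat the chat-completion field access as an assumption to check against your own output.

```python
# Sketch only: inspect results.jsonl from the examples above.
# Field names assume the OpenAI-style batch output schema
# (custom_id plus a nested response.body); verify against real output.
import json

with open("results.jsonl") as f:
    for line in f:
        record = json.loads(line)
        body = (record.get("response") or {}).get("body") or {}
        # For chat-completion batches the text lives under choices[0].message.content.
        choices = body.get("choices") or [{}]
        text = choices[0].get("message", {}).get("content")
        print(record.get("custom_id"), text)
```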

View File

@@ -2,7 +2,6 @@
 import json
 import subprocess
-import sys
 import tempfile
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
@@ -35,9 +34,8 @@ def test_empty_file():
         input_file.write("")
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -54,9 +52,8 @@ def test_completions():
         input_file.write(INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -79,9 +76,8 @@ def test_completions_invalid_input():
         input_file.write(INVALID_INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -95,9 +91,8 @@ def test_embeddings():
         input_file.write(INPUT_EMBEDDING_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -117,9 +112,8 @@ def test_score():
         input_file.write(INPUT_SCORE_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
+            "vllm",
+            "run-batch",
             "-i",
             input_file.name,
             "-o",

View File

@@ -7,6 +7,7 @@ import sys
 import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.collect_env
 import vllm.entrypoints.cli.openai
+import vllm.entrypoints.cli.run_batch
 import vllm.entrypoints.cli.serve
 import vllm.version
 from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
@@ -17,6 +18,7 @@ CMD_MODULES = [
     vllm.entrypoints.cli.serve,
     vllm.entrypoints.cli.benchmark.main,
     vllm.entrypoints.cli.collect_env,
+    vllm.entrypoints.cli.run_batch,
 ]
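
Registering the new module in `CMD_MODULES` is sufficient because the CLI discovers subcommands through the shared `cmd_init()` contract. Roughly how that list is consumed, as a sketch; the real dispatch loop in the CLI main module may differ in its details:

```python
# Sketch of the registration contract shared by the CMD_MODULES entries,
# including the new run_batch module; the real dispatch loop may differ.
import argparse


def register_subcommands(parser: argparse.ArgumentParser, cmd_modules) -> dict:
    subparsers = parser.add_subparsers(dest="subparser", required=False)
    registered = {}
    for module in cmd_modules:
        # Each module exposes cmd_init() -> list[CLISubcommand].
        for cmd in module.cmd_init():
            sub = cmd.subparser_init(subparsers)
            sub.set_defaults(dispatch_function=cmd.cmd)
            registered[cmd.name] = cmd
    return registered
```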

View File

@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import asyncio
+
+from prometheus_client import start_http_server
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.logger import logger
+from vllm.entrypoints.openai.run_batch import main as run_batch_main
+from vllm.entrypoints.openai.run_batch import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+from vllm.version import __version__ as VLLM_VERSION
+
+
+class RunBatchSubcommand(CLISubcommand):
+    """The `run-batch` subcommand for vLLM CLI."""
+
+    def __init__(self):
+        self.name = "run-batch"
+        super().__init__()
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        logger.info("vLLM batch processing API version %s", VLLM_VERSION)
+        logger.info("args: %s", args)
+
+        # Start the Prometheus metrics server.
+        # LLMEngine uses the Prometheus client
+        # to publish metrics at the /metrics endpoint.
+        if args.enable_metrics:
+            logger.info("Prometheus metrics enabled")
+            start_http_server(port=args.port, addr=args.url)
+        else:
+            logger.info("Prometheus metrics disabled")
+
+        asyncio.run(run_batch_main(args))
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        run_batch_parser = subparsers.add_parser(
+            "run-batch",
+            help="Run batch prompts and write results to file.",
+            description=(
+                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
+                "Supports local or HTTP input/output files."),
+            usage=
+            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+        )
+        return make_arg_parser(run_batch_parser)
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [RunBatchSubcommand()]
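
The same code path can also be driven without the console script, which can help when debugging. A sketch follows, assuming `-i`, `-o`, `--model`, and the flags read by `cmd()` above (`enable_metrics`, `port`, `url`) are all registered by `make_arg_parser`:

```python
# Illustrative: drive the same path as `vllm run-batch` from Python.
# Flag names are assumed to match the parser built by make_arg_parser;
# check them against `vllm run-batch --help`.
from vllm.entrypoints.cli.run_batch import RunBatchSubcommand
from vllm.entrypoints.openai.run_batch import make_arg_parser
from vllm.utils import FlexibleArgumentParser

parser = make_arg_parser(FlexibleArgumentParser())
args = parser.parse_args([
    "-i", "openai_example_batch.jsonl",
    "-o", "results.jsonl",
    "--model", "meta-llama/Meta-Llama-3-8B-Instruct",
])
# Starts the Prometheus server only if --enable-metrics was given,
# then runs the async batch main to completion.
RunBatchSubcommand.cmd(args)
```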

View File

@@ -33,9 +33,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
 
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible batch runner.")
+def make_arg_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "-i",
         "--input-file",
@@ -98,7 +96,13 @@ def parse_args():
         default=False,
         help="If set to True, enable prompt_tokens_details in usage.")
 
-    return parser.parse_args()
+    return parser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description="vLLM OpenAI-Compatible batch runner.")
+    return make_arg_parser(parser).parse_args()
 
 
 # explicitly use pure text format, with a newline at the end
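
The net effect of this last refactor is that both entry points build their flags from the same `make_arg_parser`: the module keeps its original `parse_args()` behavior, while `vllm run-batch` hands in the subparser created by `RunBatchSubcommand.subparser_init`. A simplified sketch of the two construction paths (argparse wiring abbreviated, not code from this commit):

```python
# Sketch of the two parser-construction paths enabled by the refactor.
from vllm.entrypoints.openai.run_batch import make_arg_parser
from vllm.utils import FlexibleArgumentParser

# Path 1: `python -m vllm.entrypoints.openai.run_batch`
standalone = make_arg_parser(
    FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner."))

# Path 2: `vllm run-batch` -- the CLI creates a subparser and passes it in,
# so the very same flags are registered under the subcommand.
top_level = FlexibleArgumentParser(prog="vllm")
subparsers = top_level.add_subparsers(dest="subparser")
run_batch_parser = make_arg_parser(subparsers.add_parser("run-batch"))
```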