diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md
index 42a19f71e9..ce75297821 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/offline_inference/openai_batch/README.md
@@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h
 For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
 
 ```console
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
+    -o results.jsonl \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \
     --model --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
+or use the `vllm` CLI:
+
+```console
+vllm run-batch \
+    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
 ### Step 4: View your results
 
 Your results are now on S3. You can view them in your terminal by running
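The README examples above all feed an OpenAI-style batch file to the runner. As a quick illustration of that workflow (not part of the patch), the sketch below writes a two-request input file in the same format as `openai_example_batch.jsonl` and then reads the `results.jsonl` that `vllm run-batch` produces; the file name `my_batch.jsonl` and the output fields accessed (`response`, `status_code`) are assumptions based on `BatchRequestOutput`, so adjust them to your setup.

```python
# Sketch only: build a small batch input in the OpenAI-style request format
# used by openai_example_batch.jsonl, then inspect the runner's output.
# "my_batch.jsonl" and the output field names are illustrative assumptions.
import json
from pathlib import Path

requests = [
    {
        "custom_id": f"request-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 64,
        },
    }
    for i, prompt in enumerate(["Hello world!", "What is vLLM?"], start=1)
]

with open("my_batch.jsonl", "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")

# After `vllm run-batch -i my_batch.jsonl -o results.jsonl --model ...`,
# each output line pairs a custom_id with a response object.
results_path = Path("results.jsonl")
if results_path.exists():
    for line in results_path.read_text().splitlines():
        result = json.loads(line)
        print(result["custom_id"], result["response"]["status_code"])
```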
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 643d0d06ab..27802945a2 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -2,7 +2,6 @@
 
 import json
 import subprocess
-import sys
 import tempfile
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
@@ -35,9 +34,8 @@ def test_empty_file():
         input_file.write("")
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -54,9 +52,8 @@ def test_completions():
         input_file.write(INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -79,9 +76,8 @@ def test_completions_invalid_input():
         input_file.write(INVALID_INPUT_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
         ], )
         proc.communicate()
         proc.wait()
@@ -95,9 +91,8 @@ def test_embeddings():
         input_file.write(INPUT_EMBEDDING_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
-            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/multilingual-e5-small"
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
+            "--model", "intfloat/multilingual-e5-small"
         ], )
         proc.communicate()
         proc.wait()
@@ -117,9 +112,8 @@ def test_score():
         input_file.write(INPUT_SCORE_BATCH)
         input_file.flush()
         proc = subprocess.Popen([
-            sys.executable,
-            "-m",
-            "vllm.entrypoints.openai.run_batch",
+            "vllm",
+            "run-batch",
             "-i",
             input_file.name,
             "-o",
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index 6676c294c8..5eba72fec1 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -7,6 +7,7 @@ import sys
 import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.collect_env
 import vllm.entrypoints.cli.openai
+import vllm.entrypoints.cli.run_batch
 import vllm.entrypoints.cli.serve
 import vllm.version
 from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
@@ -17,6 +18,7 @@ CMD_MODULES = [
     vllm.entrypoints.cli.serve,
     vllm.entrypoints.cli.benchmark.main,
     vllm.entrypoints.cli.collect_env,
+    vllm.entrypoints.cli.run_batch,
 ]
 
 
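For context on why the one-line additions to `vllm/entrypoints/cli/main.py` are enough to expose the new subcommand: every module listed in `CMD_MODULES` exposes a `cmd_init()` that returns subcommand objects, and the main CLI wires each object's `subparser_init()` and `cmd()` into a shared argument parser. The toy sketch below illustrates that registration pattern in plain `argparse`; it is a simplified stand-in, not the actual vLLM CLI code.

```python
# Toy illustration of the CMD_MODULES registration pattern (not vLLM code):
# a command module exposes cmd_init(), and the main CLI hooks each
# subcommand's parser and dispatch function into one argparse tree.
import argparse


class EchoSubcommand:
    """Stand-in for a CLISubcommand such as RunBatchSubcommand."""

    name = "echo"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        print(args.message)

    def subparser_init(self, subparsers) -> argparse.ArgumentParser:
        parser = subparsers.add_parser("echo", help="Print a message.")
        parser.add_argument("message")
        return parser


def cmd_init() -> list:
    return [EchoSubcommand()]


def main() -> None:
    parser = argparse.ArgumentParser(prog="toy-cli")
    subparsers = parser.add_subparsers(dest="subcommand", required=True)
    # vLLM's main() performs the equivalent loop over every module in CMD_MODULES.
    for cmd in cmd_init():
        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
    args = parser.parse_args()
    args.dispatch_function(args)


if __name__ == "__main__":
    main()
```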
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
new file mode 100644
index 0000000000..f74c8da9b9
--- /dev/null
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import asyncio
+
+from prometheus_client import start_http_server
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.entrypoints.logger import logger
+from vllm.entrypoints.openai.run_batch import main as run_batch_main
+from vllm.entrypoints.openai.run_batch import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+from vllm.version import __version__ as VLLM_VERSION
+
+
+class RunBatchSubcommand(CLISubcommand):
+    """The `run-batch` subcommand for vLLM CLI."""
+
+    def __init__(self):
+        self.name = "run-batch"
+        super().__init__()
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        logger.info("vLLM batch processing API version %s", VLLM_VERSION)
+        logger.info("args: %s", args)
+
+        # Start the Prometheus metrics server.
+        # LLMEngine uses the Prometheus client
+        # to publish metrics at the /metrics endpoint.
+        if args.enable_metrics:
+            logger.info("Prometheus metrics enabled")
+            start_http_server(port=args.port, addr=args.url)
+        else:
+            logger.info("Prometheus metrics disabled")
+
+        asyncio.run(run_batch_main(args))
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        run_batch_parser = subparsers.add_parser(
+            "run-batch",
+            help="Run batch prompts and write results to file.",
+            description=(
+                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
+                "Supports local or HTTP input/output files."),
+            usage=
+            "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
+        )
+        return make_arg_parser(run_batch_parser)
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [RunBatchSubcommand()]
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index eae83c9a49..f38465b22b 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -33,9 +33,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
 
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="vLLM OpenAI-Compatible batch runner.")
+def make_arg_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "-i",
         "--input-file",
@@ -98,7 +96,13 @@ def parse_args():
         default=False,
         help="If set to True, enable prompt_tokens_details in usage.")
 
-    return parser.parse_args()
+    return parser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description="vLLM OpenAI-Compatible batch runner.")
+    return make_arg_parser(parser).parse_args()
 
 
 # explicitly use pure text format, with a newline at the end
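The `make_arg_parser` refactor in `vllm/entrypoints/openai/run_batch.py` is what lets the standalone `python -m vllm.entrypoints.openai.run_batch` entry point and the new `vllm run-batch` subparser share a single set of flags. Below is a generic sketch of that "populate an existing parser and return it" pattern, using plain `argparse` and an illustrative flag set rather than the real vLLM argument list.

```python
# Generic sketch of the parser-factoring pattern used by the refactor above.
# The flag set below is illustrative; the real one lives in make_arg_parser().
import argparse


def make_arg_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Populate an existing parser with the shared batch-runner flags."""
    parser.add_argument("-i", "--input-file", required=True)
    parser.add_argument("-o", "--output-file", required=True)
    parser.add_argument("--model", required=True)
    return parser


def parse_args() -> argparse.Namespace:
    """Standalone use, mirroring the new parse_args() in the diff."""
    parser = argparse.ArgumentParser(description="Batch runner.")
    return make_arg_parser(parser).parse_args()


def register_run_batch(subparsers) -> argparse.ArgumentParser:
    """Subcommand use: reuse the exact same flags on a `run-batch` subparser."""
    return make_arg_parser(subparsers.add_parser("run-batch"))


if __name__ == "__main__":
    # e.g. python sketch.py -i in.jsonl -o out.jsonl --model some-model
    print(parse_args())
```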