# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings

This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API

Run the vLLM server first:

    vllm serve meta-llama/Llama-3.2-1B-Instruct \
        --task generate \
        --max-model-len 4096 \
        --enable-prompt-embeds

Run the client:

    python examples/online_serving/prompt_embed_inference_with_openai_client.py

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub. You must request access to
use it: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Dependencies:
- transformers
- torch
- openai
"""

import base64
import io

import torch
import transformers
from openai import OpenAI


def main():
    # Point the client at the local vLLM server. vLLM does not require an
    # API key unless the server was started with one, so a placeholder works.
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
    )

    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    # Load the tokenizer and model with Hugging Face Transformers. The model
    # is needed locally only for its input embedding layer.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

    # Build the prompt with the model's chat template; refer to the
    # Hugging Face repo for the correct format. `add_generation_prompt=True`
    # appends the assistant header so the model replies as the assistant.
    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
    token_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    )

    # Look up the prompt's embeddings from the model's input embedding layer,
    # then drop the batch dimension: final shape (seq_len, hidden_size).
    embedding_layer = transformers_model.get_input_embeddings()
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
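
    # Tip (optional): for long prompts, consider running the lookup above
    # under torch.no_grad() so no autograd graph is built; it is omitted
    # here to keep the example minimal.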

    # Serialize the embeddings tensor with torch.save, then base64-encode
    # the raw bytes so they can travel inside a JSON request body.
    buffer = io.BytesIO()
    torch.save(prompt_embeds, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
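
    # Optional sanity check (an illustrative sketch, not required for the
    # request): decoding reverses the encoding above, which is roughly what
    # the server does on receipt. Safe to delete.
    decoded_embeds = torch.load(
        io.BytesIO(base64.b64decode(encoded_embeds)), weights_only=True
    )
    assert decoded_embeds.shape == prompt_embeds.shape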

    completion = client.completions.create(
        model=model_name,
        # NOTE: The OpenAI client does not allow `None` as an input to
        # `prompt`. Use an empty string if you have no text prompts.
        prompt="",
        max_tokens=5,
        temperature=0.0,
        # NOTE: The OpenAI client allows passing extra JSON body fields via
        # the `extra_body` argument; vLLM reads `prompt_embeds` from there.
        extra_body={"prompt_embeds": encoded_embeds},
    )
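
    # Note: depending on the vLLM version, `prompt_embeds` may also accept a
    # list of base64-encoded tensors for batched requests, e.g.
    # extra_body={"prompt_embeds": [encoded_embeds]}; check the vLLM docs
    # for the exact request schema.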

    print("-" * 30)
    print(completion.choices[0].text)
    print("-" * 30)


if __name__ == "__main__":
    main()