# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
    python examples/offline_inference/prompt_embed_inference.py
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
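    # The full Transformers model is loaded here only so that its input
    # embedding layer can be reused below. As a hedged variant (not part of
    # the original example), memory use could likely be reduced by loading it
    # in half precision, e.g.
    # `AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)`.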
    embedding_layer = transformers_model.get_input_embeddings()
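    # `enable_prompt_embeds=True` is required for `LLM.generate` to accept
    # `prompt_embeds` inputs instead of token prompts.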
    llm = LLM(model=model_name, enable_prompt_embeds=True)
    return tokenizer, embedding_layer, llm


def get_prompt_embeds(
    chat: list[dict[str, str]],
    tokenizer: PreTrainedTokenizer,
    embedding_layer: torch.nn.Module,
):
    token_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    )
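    # `apply_chat_template(..., return_tensors="pt")` returns token IDs with a
    # leading batch dimension, so the embedded prompt is squeezed from
    # (1, seq_len, hidden_size) down to the (seq_len, hidden_size) tensor that
    # a single `prompt_embeds` request expects.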
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


def single_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    outputs = llm.generate(
        {
            "prompt_embeds": prompt_embeds,
        }
    )
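    # No `SamplingParams` are passed, so vLLM's defaults apply. A hedged
    # sketch (not part of the original example) that pins the decoding
    # behaviour would look roughly like:
    #
    #     from vllm import SamplingParams
    #     outputs = llm.generate(
    #         {"prompt_embeds": prompt_embeds},
    #         SamplingParams(temperature=0.0, max_tokens=64),
    #     )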

    print("\n[Single Inference Output]")
    print("-" * 30)
    for o in outputs:
        print(o.outputs[0].text)
    print("-" * 30)


def batch_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
    chats = [
        [{"role": "user", "content": "Please tell me about the capital of France."}],
        [{"role": "user", "content": "When is the day longest during the year?"}],
        [{"role": "user", "content": "Which is bigger, the moon or the sun?"}],
    ]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]
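
    # Passing a list of {"prompt_embeds": ...} dicts lets vLLM handle all
    # three requests in a single `generate` call rather than one at a time.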
    outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])

    print("\n[Batch Inference Outputs]")
    print("-" * 30)
    for i, o in enumerate(outputs):
        print(f"Q{i + 1}: {chats[i][0]['content']}")
        print(f"A{i + 1}: {o.outputs[0].text}\n")
    print("-" * 30)


def main():
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
    single_prompt_inference(llm, tokenizer, embedding_layer)
    batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
    main()