vllm/examples/online_serving/retrieval_augmented_generat...

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
================================================================

This script demonstrates a RAG system using:
- LlamaIndex: For document indexing and retrieval
- Milvus: As vector store backend
- vLLM: For embedding and text generation

Features:
1. Document Loading & Processing
2. Embedding & Storage
3. Query Processing

Requirements:
1. Install dependencies:
pip install llama-index llama-index-readers-web \
            llama-index-llms-openai-like    \
            llama-index-embeddings-openai-like \
            llama-index-vector-stores-milvus \

2. Start services:
    # Start embedding service (port 8000)
    vllm serve ssmits/Qwen2-7B-Instruct-embed-base

    # Start chat service (port 8001)
    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001

Usage:
    python retrieval_augmented_generation_with_llamaindex.py

Notes:
    - Ensure both vLLM services are running before executing
    - Default ports: 8000 (embedding), 8001 (chat)
    - First run may take time to download models
"""

import argparse
from argparse import Namespace
from typing import Any

from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.milvus import MilvusVectorStore


def init_config(args: Namespace):
    """Initialize configuration with command line arguments"""
    return {
        "url": args.url,
        "embedding_model": args.embedding_model,
        "chat_model": args.chat_model,
        "vllm_api_key": args.vllm_api_key,
        "embedding_endpoint": args.embedding_endpoint,
        "chat_endpoint": args.chat_endpoint,
        "db_path": args.db_path,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        "top_k": args.top_k,
    }


def load_documents(url: str) -> list:
    """Load and process web documents"""
    return SimpleWebPageReader(html_to_text=True).load_data([url])


def setup_models(config: dict[str, Any]):
    """Configure embedding and chat models"""
    Settings.embed_model = OpenAILikeEmbedding(
        api_base=config["embedding_endpoint"],
        api_key=config["vllm_api_key"],
        model_name=config["embedding_model"],
    )

    Settings.llm = OpenAILike(
        model=config["chat_model"],
        api_key=config["vllm_api_key"],
        api_base=config["chat_endpoint"],
        context_window=128000,
        is_chat_model=True,
        is_function_calling_model=False,
    )

    Settings.transformations = [
        SentenceSplitter(
            chunk_size=config["chunk_size"],
            chunk_overlap=config["chunk_overlap"],
        )
    ]


def setup_vector_store(db_path: str) -> MilvusVectorStore:
    """Initialize vector store"""
    sample_emb = Settings.embed_model.get_text_embedding("test")
    print(f"Embedding dimension: {len(sample_emb)}")
    return MilvusVectorStore(uri=db_path, dim=len(sample_emb), overwrite=True)


def create_index(documents: list, vector_store: MilvusVectorStore):
    """Create document index"""
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )


def query_document(index: VectorStoreIndex, question: str, top_k: int):
    """Query document with given question"""
    query_engine = index.as_query_engine(similarity_top_k=top_k)
    return query_engine.query(question)


def get_parser() -> argparse.ArgumentParser:
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")

    # Add command line arguments
    parser.add_argument(
        "--url",
        default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
        help="URL of the document to process",
    )
    parser.add_argument(
        "--embedding-model",
        default="ssmits/Qwen2-7B-Instruct-embed-base",
        help="Model name for embeddings",
    )
    parser.add_argument(
        "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
    )
    parser.add_argument(
        "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
    )
    parser.add_argument(
        "--embedding-endpoint",
        default="http://localhost:8000/v1",
        help="Base URL for embedding service",
    )
    parser.add_argument(
        "--chat-endpoint",
        default="http://localhost:8001/v1",
        help="Base URL for chat service",
    )
    parser.add_argument(
        "--db-path", default="./milvus_demo.db", help="Path to Milvus database"
    )
    parser.add_argument(
        "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
    )
    parser.add_argument(
        "-c",
        "--chunk-size",
        type=int,
        default=1000,
        help="Chunk size for document splitting",
    )
    parser.add_argument(
        "-o",
        "--chunk-overlap",
        type=int,
        default=200,
        help="Chunk overlap for document splitting",
    )
    parser.add_argument(
        "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
    )

    return parser


def main():
    # Parse command line arguments
    args = get_parser().parse_args()

    # Initialize configuration
    config = init_config(args)

    # Load documents
    documents = load_documents(config["url"])

    # Setup models
    setup_models(config)

    # Setup vector store
    vector_store = setup_vector_store(config["db_path"])

    # Create index
    index = create_index(documents, vector_store)

    if args.interactive:
        print("\nEntering interactive mode. Type 'quit' to exit.")
        while True:
            # Get user question
            question = input("\nEnter your question: ")

            # Check for exit command
            if question.lower() in ["quit", "exit", "q"]:
                print("Exiting interactive mode...")
                break

            # Get and print response
            print("\n" + "-" * 50)
            print("Response:\n")
            response = query_document(index, question, config["top_k"])
            print(response)
            print("-" * 50)
    else:
        # Single query mode
        question = "How to install vLLM?"
        response = query_document(index, question, config["top_k"])
        print("-" * 50)
        print("Response:\n")
        print(response)
        print("-" * 50)


if __name__ == "__main__":
    main()