# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501

from vllm import LLM

model_name = "Qwen/Qwen3-Reranker-0.6B"

# What is the difference between the official original version and one
# that has been converted into a sequence classification model?
# Qwen3-Reranker is a language model that performs reranking by using the
# logits of the "no" and "yes" tokens.
# The original model has to compute the logits of 151669 tokens, which makes
# this method extremely inefficient, not to mention incompatible with the
# vllm score API.
# A method for converting the original model into a sequence classification
# model was proposed. See: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
# Models converted offline using this method are not only more efficient and
# compatible with the vllm score API, but also need more concise init
# parameters, for example:
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

# If you want to load the official original version, the init parameters are
# as follows.


def get_model() -> LLM:
    """Create the vLLM engine for the official Qwen3-Reranker checkpoint.

    Returns:
        An ``LLM`` built for ``model_name`` with ``task="score"`` and the
        ``hf_overrides`` explained in the comment block after this function.
    """
    overrides = {
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    return LLM(model=model_name, task="score", hf_overrides=overrides)


# Why do we need hf_overrides for the official original version?
# vllm converts it to Qwen3ForSequenceClassification when loaded, for
# better performance.
# - First, we need `"architectures": ["Qwen3ForSequenceClassification"],`
#   to manually route to Qwen3ForSequenceClassification.
# - Then, we extract the vectors corresponding to classifier_from_token
#   from lm_head using `"classifier_from_token": ["no", "yes"]`.
# - Third, we convert these two vectors into one vector. Whether this
#   conversion logic is used is controlled by
#   `"is_original_qwen3_reranker": True`.
# Please use the query_template and document_template to format the query and
# document for better reranker results.
# NOTE: the <Instruct>/<Query>/<Document> markers and the empty
# <think>...</think> block are part of the prompt text the reranker expects;
# they are literal text, not markup, and must not be stripped.

prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"


def main() -> None:
    """Format sample queries and documents, score them, and print the scores.

    Each query is rendered with ``query_template`` (system prefix, instruction
    and query) and each document with ``document_template`` (document and
    assistant suffix). The two formatted lists are passed to ``model.score``
    and the ``outputs.score`` value of every result is printed between two
    separator lines.
    """
    instruction = (
        "Given a web search query, retrieve relevant passages that answer the query"
    )

    queries = [
        "What is the capital of China?",
        "Explain gravity",
    ]

    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
    ]

    # Keep the raw inputs and the prompt-formatted inputs under separate names
    # so it is always clear which form is handed to the model.
    formatted_queries = [
        query_template.format(prefix=prefix, instruction=instruction, query=query)
        for query in queries
    ]
    formatted_documents = [
        document_template.format(doc=doc, suffix=suffix) for doc in documents
    ]

    model = get_model()

    outputs = model.score(formatted_queries, formatted_documents)

    print("-" * 30)
    print([output.outputs.score for output in outputs])
    print("-" * 30)


if __name__ == "__main__":
    main()