[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
This commit is contained in:
Travis Johnson 2024-11-05 21:28:29 -07:00 committed by GitHub
parent ffc0f2b47a
commit 2bcbae704c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 5 deletions

View File

@@ -10,19 +10,22 @@ from ...utils import check_logprobs_close
MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese
"寫一首關於勇敢的水手的詩", # chinese
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
"Repeat the phrase 'URGENCY🌶':\nURGENCY🌶\nURGENCY🌶\n", # see https://github.com/vllm-project/vllm/pull/9625
]
# for function calling

View File

@@ -254,7 +254,7 @@ class MistralTokenizer:
skip_special_tokens: bool = True) -> str:
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
), "skip_special_tokens=False is not supported for Mistral tokenizers."
if isinstance(ids, int):
ids = [ids]
@@ -268,12 +268,16 @@ class MistralTokenizer:
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
), "skip_special_tokens=False is not supported for Mistral tokenizers."
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
if isinstance(self.tokenizer, Tekkenizer):
# skip special tokens
ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any("�" in t for t in tokens):