vllm/tests/entrypoints/openai/test_translation_validation.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
# json is used to decode the text-format translation responses
import json
from unittest.mock import patch

import librosa
import numpy as np
import pytest
import soundfile as sf
from openai._base_client import AsyncAPIClient

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer


@pytest.fixture
def foscolo():
    """Yield the ``azacinto_foscolo`` audio asset as an open binary file.

    The tests in this module send it as the input for it->en translation.
    The handle is opened inside a ``with`` block, so it is closed once the
    fixture generator is resumed after the test.
    """
    # Test translation it->en
    audio_path = str(AudioAsset('azacinto_foscolo').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file


# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo):
    """Translate the sample clip with whisper-small and check the output.

    Starts a dedicated server, sends a single request with
    ``response_format="text"`` and ``temperature=0.0``, and asserts that
    the returned text contains the expected English phrase.
    """
    whisper = "openai/whisper-small"
    launch_args = ["--enforce-eager"]
    with RemoteOpenAIServer(whisper, launch_args) as server:
        api = server.get_async_client()
        raw_response = await api.audio.translations.create(
            model=whisper,
            file=foscolo,
            response_format="text",
            # TODO remove once language detection is implemented
            extra_body={"language": "it"},
            temperature=0.0,
        )
        # The response body is decoded as JSON here; the translation is
        # stored under its 'text' key.
        payload = json.loads(raw_response)
        translated = payload['text'].strip()
        assert "Nor will I ever touch the sacred" in translated


@pytest.mark.asyncio
async def test_audio_prompt(foscolo):
    """Check the translation returned when a ``prompt`` is supplied.

    The output must contain neither the phrase that ``test_basic_audio``
    expects nor the prompt text itself.
    """
    whisper = "openai/whisper-small"
    launch_args = ["--enforce-eager"]
    # Condition whisper on starting text
    leading_text = "Nor have I ever"
    with RemoteOpenAIServer(whisper, launch_args) as server:
        api = server.get_async_client()
        raw_response = await api.audio.translations.create(
            model=whisper,
            file=foscolo,
            prompt=leading_text,
            extra_body={"language": "it"},
            response_format="text",
            temperature=0.0,
        )
        payload = json.loads(raw_response)
        translated = payload['text']
        assert "Nor will I ever touch the sacred" not in translated
        assert leading_text not in translated


@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """Expect a 400 error payload when the served model is text-to-text.

    The response must carry ``code == 400``, an empty ``text`` and the
    "does not support Translations API" message.
    """
    # text to text model
    text_only_model = "JackFram/llama-68m"
    launch_args = ["--enforce-eager"]
    with RemoteOpenAIServer(text_only_model, launch_args) as server:
        api = server.get_async_client()
        response = await api.audio.translations.create(
            model=text_only_model,
            file=foscolo,
            temperature=0.0,
        )
        assert response.code == 400
        assert not response.text
        expected_message = "The model does not support Translations API"
        assert response.message == expected_message


@pytest.mark.asyncio
async def test_streaming_response(foscolo):
    """Compare the streamed translation with the non-streamed one.

    A regular JSON request is issued first as the reference. The same
    audio is then requested with ``stream=True`` and the concatenated
    chunk contents must equal the reference text.
    """
    whisper = "openai/whisper-small"
    launch_args = ["--enforce-eager"]
    with RemoteOpenAIServer(whisper, launch_args) as server:
        api = server.get_async_client()
        reference = await api.audio.translations.create(
            model=whisper,
            file=foscolo,
            response_format="json",
            extra_body={"language": "it"},
            temperature=0.0,
        )
        # Unfortunately this only works when the openai client is patched
        # to use streaming mode, not exposed in the translation api.
        unpatched_post = AsyncAPIClient.post

        async def streaming_post(*args, **kwargs):
            # Force stream=True on every request posted by the client.
            kwargs.update(stream=True)
            return await unpatched_post(*args, **kwargs)

        with patch.object(AsyncAPIClient, "post", new=streaming_post):
            streaming_api = server.get_async_client()
            stream = await streaming_api.audio.translations.create(
                model=whisper,
                file=foscolo,
                temperature=0.0,
                extra_body={"stream": True, "language": "it"},
            )
            # Reconstruct from chunks and validate
            pieces = [
                chunk.choices[0]['delta']['content'] async for chunk in stream
            ]

        assert "".join(pieces) == reference.text


@pytest.mark.asyncio
async def test_stream_options(foscolo):
    """Check the usage-related stream options of the translations API.

    With ``stream_include_usage`` and ``stream_continuous_usage_stats``
    enabled, the stream must contain a chunk with no choices (the final
    usage chunk) and every chunk with choices must have a ``usage``
    attribute.
    """
    whisper = "openai/whisper-small"
    launch_args = ["--enforce-eager"]
    with RemoteOpenAIServer(whisper, launch_args) as server:
        unpatched_post = AsyncAPIClient.post

        async def streaming_post(*args, **kwargs):
            # Force stream=True on every request posted by the client.
            kwargs.update(stream=True)
            return await unpatched_post(*args, **kwargs)

        with patch.object(AsyncAPIClient, "post", new=streaming_post):
            api = server.get_async_client()
            stream = await api.audio.translations.create(
                model=whisper,
                file=foscolo,
                temperature=0.0,
                extra_body={
                    "language": "it",
                    "stream": True,
                    "stream_include_usage": True,
                    "stream_continuous_usage_stats": True,
                },
            )
            saw_final_usage = False
            usage_on_every_chunk = True
            async for chunk in stream:
                if chunk.choices:
                    if usage_on_every_chunk and not hasattr(chunk, 'usage'):
                        usage_on_every_chunk = False
                else:
                    # final usage sent
                    saw_final_usage = True
            assert saw_final_usage
            assert usage_on_every_chunk


@pytest.mark.asyncio
async def test_long_audio_request(foscolo):
    """Translate the sample clip repeated back to back.

    The audio is tiled twice, written to an in-memory WAV buffer and sent
    as one request. The lower-cased output must contain the phrase
    "nor will i ever" once per repetition.
    """
    whisper = "openai/whisper-small"
    launch_args = ["--enforce-eager"]
    repetitions = 2

    foscolo.seek(0)
    samples, sample_rate = librosa.load(foscolo)
    # Repeated audio to buffer
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer,
             np.tile(samples, repetitions),
             sample_rate,
             format='WAV')
    wav_buffer.seek(0)
    with RemoteOpenAIServer(whisper, launch_args) as server:
        api = server.get_async_client()
        raw_response = await api.audio.translations.create(
            model=whisper,
            file=wav_buffer,
            extra_body={"language": "it"},
            response_format="text",
            temperature=0.0,
        )
        payload = json.loads(raw_response)
        translated = payload['text'].strip().lower()
        # TODO investigate higher model uncertainty for longer translations.
        assert translated.count("nor will i ever") == repetitions