mirror of https://github.com/dapr/dapr-agents.git
454 lines
12 KiB
Plaintext
454 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# LLM: OpenAI Audio Endpoint Basic Examples\n",
|
|
"\n",
|
|
"This notebook demonstrates how to use the `OpenAIAudioClient` in `dapr-agents` for basic tasks with the OpenAI Audio API. We will explore:\n",
|
|
"\n",
|
|
"* Generating speech from text and saving it as an MP3 file.\n",
|
|
"* Transcribing audio to text.\n",
|
|
"* Translating audio content to English."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Install Required Libraries\n",
|
|
"\n",
|
|
"Ensure you have the required libraries installed:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%pip install dapr-agents python-dotenv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
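|
"## Load Environment Variables\n",
|
|
"\n",
|
|
"Load settings such as your OpenAI API key from a local `.env` file so they are available to the client created below."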
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from dotenv import load_dotenv\n",
|
|
"load_dotenv()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Initialize OpenAIAudioClient"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from dapr_agents import OpenAIAudioClient\n",
|
|
"\n",
|
|
"client = OpenAIAudioClient()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Generate Speech from Text\n",
|
|
"\n",
|
|
"### Manual File Creation\n",
|
|
"\n",
|
|
"This section demonstrates how to generate speech from a given text input and save it as an MP3 file."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Audio saved to output_speech.mp3\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from dapr_agents.types.llm import AudioSpeechRequest\n",
|
|
"\n",
|
|
"# Define the text to convert to speech\n",
|
|
"text_to_speech = \"Hello Roberto! This is an example of text-to-speech generation.\"\n",
|
|
"\n",
|
|
"# Create a request for TTS\n",
|
|
"tts_request = AudioSpeechRequest(\n",
|
|
" model=\"tts-1\",\n",
|
|
" input=text_to_speech,\n",
|
|
" voice=\"fable\",\n",
|
|
" response_format=\"mp3\"\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate the audio\n",
|
|
"audio_bytes = client.create_speech(request=tts_request)\n",
|
|
"\n",
|
|
"# Save the audio to an MP3 file\n",
|
|
"output_path = \"output_speech.mp3\"\n",
|
|
"with open(output_path, \"wb\") as audio_file:\n",
|
|
" audio_file.write(audio_bytes)\n",
|
|
"\n",
|
|
"print(f\"Audio saved to {output_path}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Automatic File Creation\n",
|
|
"\n",
|
|
"When the `file_name` parameter is provided, `create_speech` saves the audio file directly, so no manual file handling is needed."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from dapr_agents.types.llm import AudioSpeechRequest\n",
|
|
"\n",
|
|
"# Define the text to convert to speech\n",
|
|
"text_to_speech = \"Hola Roberto! Este es otro ejemplo de generacion de voz desde texto.\"\n",
|
|
"\n",
|
|
"# Create a request for TTS\n",
|
|
"tts_request = AudioSpeechRequest(\n",
|
|
" model=\"tts-1\",\n",
|
|
" input=text_to_speech,\n",
|
|
" voice=\"echo\",\n",
|
|
" response_format=\"mp3\"\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate the audio\n",
|
|
"client.create_speech(request=tts_request, file_name=\"output_speech_spanish_auto.mp3\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Transcribe Audio to Text\n",
|
|
"\n",
|
|
"This section demonstrates how to transcribe audio content into text."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using a File Path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Transcription: Hello Roberto, this is an example of text-to-speech generation.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from dapr_agents.types.llm import AudioTranscriptionRequest\n",
|
|
"\n",
|
|
"# Specify the audio file to transcribe\n",
|
|
"audio_file_path = \"output_speech.mp3\"\n",
|
|
"\n",
|
|
"# Create a transcription request\n",
|
|
"transcription_request = AudioTranscriptionRequest(\n",
|
|
" model=\"whisper-1\",\n",
|
|
" file=audio_file_path\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate transcription\n",
|
|
"transcription_response = client.create_transcription(request=transcription_request)\n",
|
|
"\n",
|
|
"# Display the transcription result\n",
|
|
"print(\"Transcription:\", transcription_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using Audio Bytes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Transcription: Hola Roberto, este es otro ejemplo de generación de voz desde texto.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Read the Spanish audio generated above into memory as raw bytes\n",
|
|
"with open(\"output_speech_spanish_auto.mp3\", \"rb\") as f:\n",
|
|
"    audio_bytes = f.read()\n",
|
|
"\n",
|
|
"transcription_request = AudioTranscriptionRequest(\n",
|
|
"    model=\"whisper-1\",\n",
|
|
"    file=audio_bytes,  # File as bytes\n",
|
|
"    language=\"es\"  # Optional: ISO-639-1 code of the language spoken in the audio (Spanish here)\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate transcription\n",
|
|
"transcription_response = client.create_transcription(request=transcription_request)\n",
|
|
"\n",
|
|
"# Display the transcription result\n",
|
|
"print(\"Transcription:\", transcription_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using File-Like Objects (e.g., BufferedReader)\n",
|
|
"\n",
|
|
"You can use file-like objects, such as `BufferedReader`, directly for transcription or translation."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Transcription: ¡Hola, Roberto! Este es otro ejemplo de generación de voz desde texto.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from io import BufferedReader\n",
|
|
"\n",
|
|
"# Open the audio file as a BufferedReader\n",
|
|
"audio_file_path = \"output_speech_spanish_auto.mp3\"\n",
|
|
"with open(audio_file_path, \"rb\") as f:\n",
|
|
" buffered_file = BufferedReader(f)\n",
|
|
"\n",
|
|
" # Create a transcription request\n",
|
|
" transcription_request = AudioTranscriptionRequest(\n",
|
|
" model=\"whisper-1\",\n",
|
|
" file=buffered_file, # File as BufferedReader\n",
|
|
" language=\"es\"\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Generate transcription\n",
|
|
" transcription_response = client.create_transcription(request=transcription_request)\n",
|
|
"\n",
|
|
" # Display the transcription result\n",
|
|
" print(\"Transcription:\", transcription_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Translate Audio to English\n",
|
|
"\n",
|
|
"This section demonstrates how to translate audio content into English."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using a File Path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Translation: Hola Roberto, este es otro ejemplo de generación de voz desde texto.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from dapr_agents.types.llm import AudioTranslationRequest\n",
|
|
"\n",
|
|
"# Specify the audio file to translate\n",
|
|
"audio_file_path = \"output_speech_spanish_auto.mp3\"\n",
|
|
"\n",
|
|
"# Create a translation request\n",
|
|
"translation_request = AudioTranslationRequest(\n",
|
|
" model=\"whisper-1\",\n",
|
|
" file=audio_file_path,\n",
|
|
" prompt=\"The following audio needs to be translated to English.\"\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate translation\n",
|
|
"translation_response = client.create_translation(request=translation_request)\n",
|
|
"\n",
|
|
"# Display the translation result\n",
|
|
"print(\"Translation:\", translation_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using Audio Bytes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Translation: Hola Roberto, este es otro ejemplo de generación de voz desde texto.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Read the Spanish audio file into memory as raw bytes\n",
|
|
"\n",
|
|
"with open(\"output_speech_spanish_auto.mp3\", \"rb\") as f:\n",
|
|
" audio_bytes = f.read()\n",
|
|
"\n",
|
|
"translation_request = AudioTranslationRequest(\n",
|
|
" model=\"whisper-1\",\n",
|
|
" file=audio_bytes, # File as bytes\n",
|
|
" prompt=\"The following audio needs to be translated to English.\"\n",
|
|
")\n",
|
|
"\n",
|
|
"# Generate translation\n",
|
|
"translation_response = client.create_translation(request=translation_request)\n",
|
|
"\n",
|
|
"# Display the translation result\n",
|
|
"print(\"Translation:\", translation_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using File-Like Objects (e.g., BufferedReader) for Translation\n",
|
|
"\n",
|
|
"You can use a file-like object, such as a `BufferedReader`, directly for translating audio content."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Translation: Hola Roberto, este es otro ejemplo de generación de voz desde texto.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from io import BufferedReader\n",
|
|
"\n",
|
|
"# Open the audio file as a BufferedReader\n",
|
|
"audio_file_path = \"output_speech_spanish_auto.mp3\"\n",
|
|
"with open(audio_file_path, \"rb\") as f:\n",
|
|
" buffered_file = BufferedReader(f)\n",
|
|
"\n",
|
|
" # Create a translation request\n",
|
|
" translation_request = AudioTranslationRequest(\n",
|
|
" model=\"whisper-1\",\n",
|
|
" file=buffered_file, # File as BufferedReader\n",
|
|
" prompt=\"The following audio needs to be translated to English.\"\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Generate translation\n",
|
|
" translation_response = client.create_translation(request=translation_request)\n",
|
|
"\n",
|
|
" # Display the translation result\n",
|
|
" print(\"Translation:\", translation_response.text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|