Adding audio tools with Whisper
Files changed:
- agents/llama_index_agent.py  +16 -5
- tools/multimedia_tools.py  +29 -93
agents/llama_index_agent.py
CHANGED
@@ -8,7 +8,9 @@ from typing import Optional, List, Any, Dict
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.anthropic import Anthropic
 # In your GaiaAgent class initialization, add these imports at the top
-from tools.multimedia_tools import
+from tools.multimedia_tools import (
+    transcribe_audio_tool,
+)
 
 from tools.web_tools import (
     tavily_tool,
@@ -61,8 +63,7 @@ class GaiaAgent(ReActAgent):
             wikipedia_tool.load_data,
             wikipedia_tool.search_data,
             tavily_tool.search,
-            get_audio_metadata
+            transcribe_audio_tool,
         ]
 
         # Use default system prompt if not provided
@@ -174,11 +175,22 @@ class GaiaAgent(ReActAgent):
 
     ## SOLVING METHODOLOGY
     1. For each question, thoroughly work through the reasoning step-by-step
-    2. Use available tools
+    2. Use available tools when needed:
+       - reverse_text_tool: For reversing text
+       - search tools (wikipedia_tool, tavily_tool): For finding information
+       - transcribe_audio: For transcribing audio files (provide the path to the audio file)
+       - get_audio_metadata: For getting metadata about audio files
     3. Document your full analysis, including all key facts, calculations, and relevant information
     4. Clearly identify what you believe the correct answer is
     5. Be extremely explicit about the required formatting for the final answer
 
+    ## HANDLING AUDIO TASKS
+    When dealing with audio files:
+    1. Use the transcribe_audio tool to get a full transcript of the audio content
+    2. Extract the specific information requested from the transcript
+    3. Format your answer exactly as requested in the question
+    4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
+
     ## DELEGATION TO WRITER AGENT
     After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
     - query: The original question
@@ -197,7 +209,6 @@ class GaiaAgent(ReActAgent):
     IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
     """
 
-
 def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
     """
     Create a writer agent that formats final answers based on research notes.
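
Note: the agent's tools list now mixes plain callables (wikipedia_tool.load_data, tavily_tool.search) with transcribe_audio_tool, which is already a FunctionTool. A minimal sketch, not part of the commit, of how that mixed list can be normalized before handing it to the agent; the import path for wikipedia_tool is an assumption:

    # Sketch only: normalize a mixed list of callables and FunctionTools.
    from llama_index.core.tools import BaseTool, FunctionTool

    from tools.multimedia_tools import transcribe_audio_tool
    from tools.web_tools import tavily_tool, wikipedia_tool  # wikipedia_tool path assumed

    raw_tools = [
        wikipedia_tool.load_data,
        wikipedia_tool.search_data,
        tavily_tool.search,
        transcribe_audio_tool,
    ]

    # Wrap plain callables in FunctionTool; existing tools pass through unchanged.
    tools = [
        t if isinstance(t, BaseTool) else FunctionTool.from_defaults(fn=t)
        for t in raw_tools
    ]

Some llama_index agent constructors accept bare callables directly, in which case the list can also be passed exactly as it appears in the diff.
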
tools/multimedia_tools.py
CHANGED
@@ -1,114 +1,50 @@
 import os
-from typing import Optional,
-from pathlib import Path
+from typing import Optional, Dict, Any
 from llama_index.readers.whisper import WhisperReader
-from llama_index.core.
+from llama_index.core.tools import FunctionTool
 
 
+class WhisperTranscriber:
+    """Class for transcribing audio using OpenAI's Whisper model."""
 
     def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
-        """
-        Args:
-            model: The Whisper model to use
-            api_key: OpenAI API key (defaults to OPENAI_API_KEY environment variable)
-        """
+        """Initialize the WhisperTranscriber with specified model and API key."""
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        self.model = model
         self.reader = WhisperReader(
-            model=model,
-            api_key=api_key
+            model=self.model,
+            api_key=self.api_key,
         )
 
-    def transcribe(self,
+    def transcribe(self, audio_file_path: str) -> str:
         """
         Transcribe an audio file to text.
 
         Args:
-            audio_path: Path to the audio file
+            audio_file_path: Path to the audio file (.mp3, .wav, etc.)
 
         Returns:
-            Transcribed text content
+            Transcribed text from the audio file
         """
         try:
-            documents = self.reader.load_data(audio_path)
-            if not documents:
-                return "No transcription was generated."
-
-            # Return the combined text from all documents
-            return "\n\n".join([doc.text for doc in documents])
+            # Load data from audio file
+            documents = self.reader.load_data(audio_file_path)
+
+            # Extract and concatenate text from all returned documents
+            if documents and len(documents) > 0:
+                transcription = " ".join([doc.text for doc in documents if hasattr(doc, 'text')])
+                return transcription
+            return "No transcription was generated from the audio file."
         except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    async def transcribe_async(self, audio_path: str) -> str:
-        """
-        Transcribe an audio file to text asynchronously.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Transcribed text
-        """
-        try:
-            return "No transcription was generated."
-        except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    def get_metadata(self, audio_path: str) -> dict:
-        """
-        Get metadata about an audio file.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Dictionary containing metadata
-        """
-        path = Path(audio_path)
-        try:
-            # Basic file metadata
-            metadata = {
-                "filename": path.name,
-                "extension": path.suffix,
-                "size_bytes": path.stat().st_size if path.exists() else None,
-                "exists": path.exists(),
-                "is_file": path.is_file() if path.exists() else None,
-            }
-            return metadata
-        except Exception as e:
-            return {"error": str(e)}
+            return f"Error transcribing audio file: {str(e)}"
 
 
-        audio_path: Path to the audio file
-
-    Returns:
-        Transcribed text content
-    """
-    return whisper_tool.transcribe(audio_path)
-
-def get_audio_metadata(audio_path: str) -> dict:
-    """
-    Get metadata about an audio file.
-
-    Args:
-        audio_path: Path to the audio file
-
-    Returns:
-        Dictionary containing metadata
-    """
-    return whisper_tool.get_metadata(audio_path)
+# Initialize the transcriber
+whisper_transcriber = WhisperTranscriber()
+
+# Create a function tool for audio transcription
+transcribe_audio_tool = FunctionTool.from_defaults(
+    name="transcribe_audio",
+    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
+    fn=whisper_transcriber.transcribe
+)
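
A quick usage sketch of the new module, outside the commit itself. The file path is hypothetical, and it assumes the WhisperReader package (llama-index-readers-whisper) is installed and OPENAI_API_KEY is set, since WhisperTranscriber falls back to that environment variable:

    # pip install llama-index-readers-whisper   (assumed package providing WhisperReader)
    # export OPENAI_API_KEY=...                 (picked up when api_key is not passed)
    from tools.multimedia_tools import whisper_transcriber, transcribe_audio_tool

    # Call the transcriber directly with a (hypothetical) local file...
    text = whisper_transcriber.transcribe("downloads/interview.mp3")
    print(text)

    # ...or go through the FunctionTool, the way the agent invokes it.
    result = transcribe_audio_tool.call(audio_file_path="downloads/interview.mp3")
    print(result.content)

Because transcribe returns an error string instead of raising, the agent always gets text back; callers that need to distinguish failures should check for the "Error transcribing audio file:" prefix.
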