drAbreu committed
Commit 09a77ad · 1 Parent(s): f0544fd

Adding audio tools with whisper
agents/llama_index_agent.py CHANGED
@@ -8,7 +8,9 @@ from typing import Optional, List, Any, Dict
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.anthropic import Anthropic
 # In your GaiaAgent class initialization, add these imports at the top
-from tools.multimedia_tools import transcribe_audio, get_audio_metadata
+from tools.multimedia_tools import (
+    transcribe_audio_tool,
+)
 
 from tools.web_tools import (
     tavily_tool,
@@ -61,8 +63,7 @@ class GaiaAgent(ReActAgent):
             wikipedia_tool.load_data,
             wikipedia_tool.search_data,
             tavily_tool.search,
-            transcribe_audio,
-            get_audio_metadata
+            transcribe_audio_tool,
         ]
 
         # Use default system prompt if not provided
@@ -174,11 +175,22 @@ class GaiaAgent(ReActAgent):
 
     ## SOLVING METHODOLOGY
     1. For each question, thoroughly work through the reasoning step-by-step
-    2. Use available tools (reverse_text_tool, search tools) when needed
+    2. Use available tools when needed:
+       - reverse_text_tool: For reversing text
+       - search tools (wikipedia_tool, tavily_tool): For finding information
+       - transcribe_audio: For transcribing audio files (provide the path to the audio file)
+       - get_audio_metadata: For getting metadata about audio files
     3. Document your full analysis, including all key facts, calculations, and relevant information
     4. Clearly identify what you believe the correct answer is
     5. Be extremely explicit about the required formatting for the final answer
 
+    ## HANDLING AUDIO TASKS
+    When dealing with audio files:
+    1. Use the transcribe_audio tool to get a full transcript of the audio content
+    2. Extract the specific information requested from the transcript
+    3. Format your answer exactly as requested in the question
+    4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
+
     ## DELEGATION TO WRITER AGENT
     After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
     - query: The original question
@@ -197,7 +209,6 @@ class GaiaAgent(ReActAgent):
     IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
     """
 
-
 def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
     """
     Create a writer agent that formats final answers based on research notes.
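
For context, a minimal sketch of how a FunctionTool such as transcribe_audio_tool is consumed by a LlamaIndex ReAct agent is shown below. It is not part of this commit: the import path, model name, audio file, and the from_tools constructor are assumptions about the classic LlamaIndex agent API, whereas GaiaAgent itself subclasses ReActAgent and builds its own tool list as in the diff above.

from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI

from tools.multimedia_tools import transcribe_audio_tool

# Hypothetical standalone agent; the real GaiaAgent registers the same tool
# alongside wikipedia_tool and tavily_tool in its own tools list.
llm = OpenAI(model="gpt-4o-mini")  # assumed model name
agent = ReActAgent.from_tools(
    tools=[transcribe_audio_tool],
    llm=llm,
    verbose=True,
)

# The agent can now choose to call transcribe_audio when a question involves audio.
response = agent.chat("Transcribe interview.mp3 and list every speaker named in it.")  # hypothetical file
print(response)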
tools/multimedia_tools.py CHANGED
@@ -1,114 +1,50 @@
 import os
-from typing import Optional, List, Tuple
-from pathlib import Path
+from typing import Optional, Dict, Any
 from llama_index.readers.whisper import WhisperReader
-from llama_index.core.schema import Document
+from llama_index.core.tools import FunctionTool
 
-class WhisperTool:
-    """Tool for transcribing audio files using OpenAI's Whisper model."""
+
+class WhisperTranscriber:
+    """Class for transcribing audio using OpenAI's Whisper model."""
 
     def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
-        """
-        Initialize the WhisperTool.
-
-        Args:
-            model: The Whisper model to use
-            api_key: OpenAI API key (defaults to OPENAI_API_KEY environment variable)
-        """
+        """Initialize the WhisperTranscriber with specified model and API key."""
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        self.model = model
         self.reader = WhisperReader(
-            model=model,
-            api_key=api_key or os.getenv("OPENAI_API_KEY"),
+            model=self.model,
+            api_key=self.api_key,
         )
 
-    def transcribe(self, audio_path: str) -> str:
+    def transcribe(self, audio_file_path: str) -> str:
         """
         Transcribe an audio file to text.
 
         Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Transcribed text content
-        """
-        try:
-            documents = self.reader.load_data(audio_path)
-            if not documents:
-                return "No transcription was generated."
-
-            # Return the combined text from all documents
-            return "\n\n".join([doc.text for doc in documents])
-        except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    async def transcribe_async(self, audio_path: str) -> str:
-        """
-        Transcribe an audio file to text asynchronously.
-
-        Args:
-            audio_path: Path to the audio file
+            audio_file_path: Path to the audio file (.mp3, .wav, etc.)
 
         Returns:
-            Transcribed text content
+            Transcribed text from the audio file
         """
         try:
-            documents = await self.reader.aload_data(audio_path)
-            if not documents:
-                return "No transcription was generated."
+            # Load data from audio file
+            documents = self.reader.load_data(audio_file_path)
 
-            # Return the combined text from all documents
-            return "\n\n".join([doc.text for doc in documents])
+            # Extract and concatenate text from all returned documents
+            if documents and len(documents) > 0:
+                transcription = " ".join([doc.text for doc in documents if hasattr(doc, 'text')])
+                return transcription
+            return "No transcription was generated from the audio file."
         except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    def get_metadata(self, audio_path: str) -> dict:
-        """
-        Get metadata about an audio file.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Dictionary containing metadata
-        """
-        path = Path(audio_path)
-        try:
-            # Basic file metadata
-            metadata = {
-                "filename": path.name,
-                "extension": path.suffix,
-                "size_bytes": path.stat().st_size if path.exists() else None,
-                "exists": path.exists(),
-                "is_file": path.is_file() if path.exists() else None,
-            }
-            return metadata
-        except Exception as e:
-            return {"error": str(e)}
+            return f"Error transcribing audio file: {str(e)}"
 
 
-# Create a singleton instance for use as a tool
-whisper_tool = WhisperTool()
+# Initialize the transcriber
+whisper_transcriber = WhisperTranscriber()
 
-# Define tool functions that can be used directly with LlamaIndex
-def transcribe_audio(audio_path: str) -> str:
-    """
-    Transcribe an audio file to text.
-
-    Args:
-        audio_path: Path to the audio file
-
-    Returns:
-        Transcribed text content
-    """
-    return whisper_tool.transcribe(audio_path)
-
-def get_audio_metadata(audio_path: str) -> dict:
-    """
-    Get metadata about an audio file.
-
-    Args:
-        audio_path: Path to the audio file
-
-    Returns:
-        Dictionary containing metadata
-    """
-    return whisper_tool.get_metadata(audio_path)
+# Create a function tool for audio transcription
+transcribe_audio_tool = FunctionTool.from_defaults(
+    name="transcribe_audio",
+    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
+    fn=whisper_transcriber.transcribe
+)
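
For a quick local check of the new tool, a usage sketch along these lines should work (not part of the commit; the audio path is hypothetical and OPENAI_API_KEY must be set in the environment):

import os

from tools.multimedia_tools import transcribe_audio_tool, whisper_transcriber

assert os.getenv("OPENAI_API_KEY"), "Whisper transcription requires an OpenAI API key"

# Call the transcriber directly...
text = whisper_transcriber.transcribe("path/to/example.mp3")  # hypothetical file
print(text)

# ...or go through the FunctionTool wrapper that the agent sees; call() returns a
# ToolOutput whose content holds the transcript string.
result = transcribe_audio_tool.call("path/to/example.mp3")
print(result.content)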