drAbreu committed on
Commit f0544fd · 1 Parent(s): 09f2a63

Adding audio tools with whisper
agents/llama_index_agent.py CHANGED
@@ -7,6 +7,8 @@ import os
 from typing import Optional, List, Any, Dict
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.anthropic import Anthropic
+# Audio tools backed by Whisper, registered with the agent's tool list below
+from tools.multimedia_tools import transcribe_audio, get_audio_metadata
 
 from tools.web_tools import (
     tavily_tool,
@@ -58,7 +60,9 @@ class GaiaAgent(ReActAgent):
             reverse_text_tool,
             wikipedia_tool.load_data,
             wikipedia_tool.search_data,
-            tavily_tool.search
+            tavily_tool.search,
+            transcribe_audio,
+            get_audio_metadata
         ]
 
         # Use default system prompt if not provided
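
As a side note, a minimal sketch of how these plain functions can also be exposed as explicit LlamaIndex tools; the FunctionTool wrapping and the print loop are illustrative assumptions, not part of this commit, which passes the bare callables directly:

# Hypothetical sketch: wrapping the new audio functions as FunctionTools.
from llama_index.core.tools import FunctionTool
from tools.multimedia_tools import transcribe_audio, get_audio_metadata

# FunctionTool derives each tool's name and description from the
# function's signature and docstring, so the agent can pick it by name.
audio_tools = [
    FunctionTool.from_defaults(fn=transcribe_audio),
    FunctionTool.from_defaults(fn=get_audio_metadata),
]

for tool in audio_tools:
    print(tool.metadata.name)  # -> transcribe_audio, get_audio_metadata
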
requirements.txt CHANGED
@@ -4,4 +4,5 @@ llama-index
 llama-index-tools-wikipedia
 llama-index-tools-tavily-research
 llama-index-llms-anthropic
-llama-index-llms-openai
+llama-index-llms-openai
+llama-index-readers-whisper
tools/multimedia_tools.py ADDED
@@ -0,0 +1,114 @@
+import os
+from typing import Optional
+from pathlib import Path
+
+from llama_index.readers.whisper import WhisperReader
+
+
+class WhisperTool:
+    """Tool for transcribing audio files using OpenAI's Whisper model."""
+
+    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
+        """
+        Initialize the WhisperTool.
+
+        Args:
+            model: The Whisper model to use
+            api_key: OpenAI API key (defaults to the OPENAI_API_KEY environment variable)
+        """
+        self.reader = WhisperReader(
+            model=model,
+            api_key=api_key or os.getenv("OPENAI_API_KEY"),
+        )
+
+    def transcribe(self, audio_path: str) -> str:
+        """
+        Transcribe an audio file to text.
+
+        Args:
+            audio_path: Path to the audio file
+
+        Returns:
+            Transcribed text content
+        """
+        try:
+            documents = self.reader.load_data(audio_path)
+            if not documents:
+                return "No transcription was generated."
+
+            # Return the combined text from all documents
+            return "\n\n".join(doc.text for doc in documents)
+        except Exception as e:
+            return f"Error transcribing audio: {str(e)}"
+
+    async def transcribe_async(self, audio_path: str) -> str:
+        """
+        Transcribe an audio file to text asynchronously.
+
+        Args:
+            audio_path: Path to the audio file
+
+        Returns:
+            Transcribed text content
+        """
+        try:
+            documents = await self.reader.aload_data(audio_path)
+            if not documents:
+                return "No transcription was generated."
+
+            # Return the combined text from all documents
+            return "\n\n".join(doc.text for doc in documents)
+        except Exception as e:
+            return f"Error transcribing audio: {str(e)}"
+
+    def get_metadata(self, audio_path: str) -> dict:
+        """
+        Get metadata about an audio file.
+
+        Args:
+            audio_path: Path to the audio file
+
+        Returns:
+            Dictionary containing basic file metadata
+        """
+        path = Path(audio_path)
+        try:
+            # Basic file metadata from the filesystem
+            metadata = {
+                "filename": path.name,
+                "extension": path.suffix,
+                "size_bytes": path.stat().st_size if path.exists() else None,
+                "exists": path.exists(),
+                "is_file": path.is_file() if path.exists() else None,
+            }
+            return metadata
+        except Exception as e:
+            return {"error": str(e)}
+
+
+# Create a singleton instance for use as a tool
+whisper_tool = WhisperTool()
+
+
+# Plain functions that can be registered directly as LlamaIndex tools
+def transcribe_audio(audio_path: str) -> str:
+    """
+    Transcribe an audio file to text.
+
+    Args:
+        audio_path: Path to the audio file
+
+    Returns:
+        Transcribed text content
+    """
+    return whisper_tool.transcribe(audio_path)
+
+
+def get_audio_metadata(audio_path: str) -> dict:
+    """
+    Get metadata about an audio file.
+
+    Args:
+        audio_path: Path to the audio file
+
+    Returns:
+        Dictionary containing basic file metadata
+    """
+    return whisper_tool.get_metadata(audio_path)
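
A quick smoke-test sketch for the new module; "sample.mp3" and the printed fields are illustrative stand-ins, not files or output from this commit:

# Hypothetical usage of the new tool functions.
from tools.multimedia_tools import transcribe_audio, get_audio_metadata

# Filesystem metadata works without any API key.
meta = get_audio_metadata("sample.mp3")
print(meta["exists"], meta["size_bytes"])

# Transcription requires OPENAI_API_KEY to be set; failures come back
# as strings rather than exceptions, per the tool's design.
text = transcribe_audio("sample.mp3")
print(text[:200])

Returning errors as strings instead of raising keeps the ReAct loop alive: the agent can read the message and decide to retry or answer differently.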