drAbreu committed
Commit 56a4634 · 1 Parent(s): ab81a57

Added vision analyzer tool / agent
agents/llama_index_agent.py CHANGED

@@ -11,6 +11,8 @@ from llama_index.llms.anthropic import Anthropic
 # In your GaiaAgent class initialization, add these imports at the top
 from tools.multimedia_tools import (
     transcribe_audio_tool,
+    encode_image_tool,
+    vision_analyzer_tool
 )
 
 from tools.web_tools import (
@@ -72,7 +74,9 @@ class GaiaAgent(ReActAgent):
             tavily_tool.search,
             transcribe_audio_tool,
             execute_python_file_tool,
-            csv_excel_reader_tool
+            csv_excel_reader_tool,
+            encode_image_tool,
+            vision_analyzer_tool
         ]
 
         # Use default system prompt if not provided
@@ -158,6 +162,21 @@ class GaiaAgent(ReActAgent):
 3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
 4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
 
+## HANDLING IMAGE ANALYSIS TASKS
+When dealing with image files for visual analysis:
+1. First, check if an image file path is mentioned in the question or available in the context
+2. For image analysis, follow this process:
+   a. Pass the image path and a specific analysis question to analyze_image_with_vision, which encodes the image internally
+   b. Use the encode_image_to_base64 tool only when you need the raw base64 string itself
+3. The vision analyzer can perform various visual analysis tasks:
+   - General image description: "Describe this image in detail"
+   - Specific information extraction: "What text appears in this image?"
+   - Visual problem solving: "How many people are in this image?"
+   - Object identification: "What brands/products are visible in this image?"
+4. Be specific in your analysis requests to get the most relevant information
+5. For tasks that require both text extraction and visual analysis, prioritize the vision analyzer
+6. Always document your analysis and include relevant details in your notes to the writer_agent
+
 ## HANDLING CSV OR EXCEL DATA TASKS
 When dealing with CSV files or data analysis tasks:
 1. Check if a CSV file path is mentioned in the question or available in the context
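
Both new tool entries resolve to FunctionTool objects, so the agent can invoke them like plain functions. A minimal sketch of the flow the new prompt section describes; the image path is hypothetical, and calling the wrapped function through the tool's fn attribute assumes the llama_index FunctionTool API:

from tools.multimedia_tools import encode_image_tool, vision_analyzer_tool

# analyze_image_with_vision takes a file path and encodes the image internally
answer = vision_analyzer_tool.fn(
    "data/board.png",                      # hypothetical image path
    "How many people are in this image?",  # one of the prompt's example queries
)
print(answer)

# encode_image_to_base64 is only needed when the raw base64 string is required
b64 = encode_image_tool.fn("data/board.png")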
requirements.txt CHANGED

@@ -7,4 +7,5 @@ llama-index-llms-anthropic
 llama-index-llms-openai
 llama-index-readers-whisper
 llama-index-readers-file
-openpyxl
+openpyxl
+Pillow
tools/multimedia_tools.py CHANGED

@@ -4,29 +4,15 @@ from llama_index.readers.whisper import WhisperReader
 from llama_index.core.tools import FunctionTool
 from llama_index.core import SimpleDirectoryReader
 from llama_index.readers.file import (
-    DocxReader,
-    HWPReader,
-    PDFReader,
-    EpubReader,
-    FlatReader,
-    HTMLTagReader,
-    ImageCaptionReader,
-    ImageReader,
-    ImageVisionLLMReader,
-    IPYNBReader,
-    MarkdownReader,
-    MboxReader,
-    PptxReader,
-    PandasCSVReader,
-    VideoAudioReader,
-    UnstructuredReader,
-    PyMuPDFReader,
-    ImageTabularChartReader,
-    XMLReader,
-    PagedCSVReader,
-    CSVReader,
-    RTFReader,
+    ImageReader
 )
+import base64
+import sys
+import traceback
+from typing import Optional
+from PIL import Image
+from openai import OpenAI
+from anthropic import Anthropic
 
 class WhisperTranscriber:
     """Class for transcribing audio using OpenAI's Whisper model."""
@@ -71,4 +57,199 @@ transcribe_audio_tool = FunctionTool.from_defaults(
     name="transcribe_audio",
     description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
     fn=whisper_transcriber.transcribe
+)
+
+
+def encode_image_to_base64(file_path: str) -> str:
+    """
+    Reads an image file and encodes it to a base64 string.
+
+    This function focuses exclusively on generating a base64 encoded string from an image file.
+
+    Args:
+        file_path (str): Path to the image file to be encoded
+
+    Returns:
+        str: The base64 encoded string of the image
+
+    Raises:
+        FileNotFoundError: If the specified file doesn't exist
+        ValueError: If the file has an unsupported extension
+
+    Examples:
+        >>> base64_data = encode_image_to_base64("data/photo.jpg")
+    """
+    # Check if file exists
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found at {file_path}")
+
+    # Get file extension
+    file_ext = os.path.splitext(file_path)[1].lower()
+    supported_formats = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']
+
+    if file_ext not in supported_formats:
+        raise ValueError(f"Unsupported file extension: {file_ext}. Supported extensions are: {', '.join(supported_formats)}")
+
+    with open(file_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+        base64_image = encoded_string.decode('utf-8')
+
+    return base64_image
+
+# Create a function tool for image encoding
+encode_image_tool = FunctionTool.from_defaults(
+    name="encode_image_to_base64",
+    description="Reads an image file and converts it to a base64 encoded string. Use this tool to prepare images for vision analysis.",
+    fn=encode_image_to_base64
+)
+
+class VisionAnalyzerAgent:
+    """
+    A specialized agent for analyzing images using vision models.
+
+    This agent can process images, analyze their content, and provide detailed descriptions
+    or answer questions about the visual elements.
+    """
+
+    def __init__(
+        self,
+        model_provider: str = "openai",
+        model_name: str = "gpt-4o",
+        api_key: Optional[str] = None,
+        **kwargs
+    ):
+        """
+        Initialize a VisionAnalyzerAgent.
+
+        Args:
+            model_provider: The LLM provider to use ("anthropic" or "openai")
+            model_name: The specific model name to use
+            api_key: API key for the provider (defaults to environment variable)
+            **kwargs: Additional parameters for the model
+        """
+        self.model_provider = model_provider.lower()
+        self.model_name = model_name
+        self.api_key = api_key
+
+        # Set up the vision model client
+        if self.model_provider == "anthropic":
+            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
+        elif self.model_provider == "openai":
+            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
+        else:
+            raise ValueError(f"Unsupported model provider: {model_provider}. "
+                             f"Supported providers are: anthropic, openai")
+
+    def analyze_image(self, image_base64: str, query: str = "Describe this image in detail.") -> str:
+        """
+        Analyze an image using the vision model.
+
+        Args:
+            image_base64: Base64 encoded image data
+            query: The question or instruction for image analysis
+
+        Returns:
+            str: The analysis result from the vision model
+        """
+        # Prepare the image for the appropriate model
+        if self.model_provider == "anthropic":
+            # Handle Anthropic Claude models
+            try:
+                # Determine MIME type from the base64 signature of the data
+                mime_type = "image/jpeg"  # Default
+                if image_base64.startswith('/9j/'):
+                    mime_type = "image/jpeg"
+                elif image_base64.startswith('iVBORw0KGgo'):
+                    mime_type = "image/png"
+
+                # Create the message with image and text
+                response = self.client.messages.create(
+                    model=self.model_name,
+                    max_tokens=1024,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": query
+                                },
+                                {
+                                    "type": "image",
+                                    "source": {
+                                        "type": "base64",
+                                        "media_type": mime_type,
+                                        "data": image_base64
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                )
+                return response.content[0].text
+
+            except Exception as e:
+                return f"Error analyzing image with Anthropic: {str(e)}"
+
+        elif self.model_provider == "openai":
+            # Handle OpenAI GPT-4 Vision models
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model_name,
+                    max_tokens=1024,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": query
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {
+                                        "url": f"data:image/jpeg;base64,{image_base64}"
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                )
+                return response.choices[0].message.content
+
+            except Exception as e:
+                return f"Error analyzing image with OpenAI: {str(e)}"
+
+        else:
+            return "Unsupported model provider"
+
+# Wrapper that encodes an image file and runs the vision analyzer in one call
+def analyze_image_with_vision(image_path: str, query: str = "Describe this image in detail.") -> str:
+    """
+    Analyze an image using a vision-enabled model.
+
+    Args:
+        image_path: Path to the image file
+        query: The question or instruction for image analysis
+
+    Returns:
+        str: The analysis result from the vision model
+    """
+    try:
+        # Encode the image to base64
+        base64_image = encode_image_to_base64(image_path)
+
+        # Create a vision analyzer agent and analyze the image
+        vision_agent = VisionAnalyzerAgent()
+        result = vision_agent.analyze_image(base64_image, query)
+
+        return result
+    except Exception as e:
+        return f"Error analyzing image: {str(e)}"
+
+# Create a function tool for vision analysis
+vision_analyzer_tool = FunctionTool.from_defaults(
+    name="analyze_image_with_vision",
+    description="Analyzes images using a vision-enabled model. Provide the image path and an optional query/instruction.",
+    fn=analyze_image_with_vision
 )
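
A short standalone usage sketch of the new module, assuming OPENAI_API_KEY (and, for the second call, ANTHROPIC_API_KEY) is set in the environment; the file path and the Claude model name are hypothetical:

from tools.multimedia_tools import (
    VisionAnalyzerAgent,
    analyze_image_with_vision,
    encode_image_to_base64,
)

# One-shot helper: encodes the file, builds a default OpenAI (gpt-4o) analyzer,
# and returns the model's answer as a string.
print(analyze_image_with_vision("data/menu.jpg", "What text appears in this image?"))

# Explicit provider selection. For Anthropic, analyze_image inspects the base64
# prefix ('/9j/' = JPEG, 'iVBORw0KGgo' = PNG) to choose the media_type it sends.
agent = VisionAnalyzerAgent(model_provider="anthropic",
                            model_name="claude-3-5-sonnet-20241022")  # hypothetical model name
print(agent.analyze_image(encode_image_to_base64("data/menu.jpg"),
                          "Describe this image in detail."))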