agents_course_final_assignement

Paused

App Files Files Community

drAbreu committed 7 days ago

Commit

56a4634

1 Parent(s): ab81a57

Added vision analyzer tool / agent

Browse files

Files changed (3) hide show

agents/llama_index_agent.py +20 -1
requirements.txt +2 -1
tools/multimedia_tools.py +202 -22

agents/llama_index_agent.py CHANGED Viewed

@@ -11,6 +11,8 @@ from llama_index.llms.anthropic import Anthropic
 # In your GaiaAgent class initialization, add these imports at the top
 from tools.multimedia_tools import (
     transcribe_audio_tool,
     )
 from tools.web_tools import (
@@ -72,7 +74,9 @@ class GaiaAgent(ReActAgent):
                 tavily_tool.search,
                 transcribe_audio_tool,
                 execute_python_file_tool,
-                csv_excel_reader_tool
                 ]
         # Use default system prompt if not provided
@@ -158,6 +162,21 @@ class GaiaAgent(ReActAgent):
         3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
         4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
         ## HANDLING CSV OR EXCEL DATA TASKS
         When dealing with CSV files or data analysis tasks:
         1. Check if a CSV file path is mentioned in the question or available in the context

 # In your GaiaAgent class initialization, add these imports at the top
 from tools.multimedia_tools import (
     transcribe_audio_tool,
+    encode_image_tool,
+    vision_analyzer_tool
     )
 from tools.web_tools import (
                 tavily_tool.search,
                 transcribe_audio_tool,
                 execute_python_file_tool,
+                csv_excel_reader_tool,
+                encode_image_tool,
+                vision_analyzer_tool
                 ]
         # Use default system prompt if not provided
         3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
         4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
+        ## HANDLING IMAGE ANALYSIS TASKS
+        When dealing with image files for visual analysis:
+        1. First, check if an image file path is mentioned in the question or available in the context
+        2. For image analysis, follow this two-step process:
+            a. Use the encode_image_to_base64 tool to convert the image to a base64 string
+            b. Pass the image path and a specific analysis question to analyze_image_with_vision
+        3. The vision analyzer can perform various visual analysis tasks:
+            - General image description: "Describe this image in detail"
+            - Specific information extraction: "What text appears in this image?"
+            - Visual problem solving: "How many people are in this image?"
+            - Object identification: "What brands/products are visible in this image?"
+        4. Be specific in your analysis requests to get the most relevant information
+        5. For tasks that require both text extraction and visual analysis, prioritize using the vision analyzer
+        6. Always document your analysis and include relevant details in your notes to the writer_agent
         ## HANDLING CSV OR EXCEL DATA TASKS
         When dealing with CSV files or data analysis tasks:
         1. Check if a CSV file path is mentioned in the question or available in the context

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ llama-index-llms-anthropic
 llama-index-llms-openai
 llama-index-readers-whisper
 llama-index-readers-file
-openpyxl

 llama-index-llms-openai
 llama-index-readers-whisper
 llama-index-readers-file
+openpyxl
+Pillow

tools/multimedia_tools.py CHANGED Viewed

@@ -4,29 +4,14 @@ from llama_index.readers.whisper import WhisperReader
 from llama_index.core.tools import FunctionTool
 from llama_index.core import SimpleDirectoryReader
 from llama_index.readers.file import (
-    DocxReader,
-    HWPReader,
-    PDFReader,
-    EpubReader,
-    FlatReader,
-    HTMLTagReader,
-    ImageCaptionReader,
-    ImageReader,
-    ImageVisionLLMReader,
-    IPYNBReader,
-    MarkdownReader,
-    MboxReader,
-    PptxReader,
-    PandasCSVReader,
-    VideoAudioReader,
-    UnstructuredReader,
-    PyMuPDFReader,
-    ImageTabularChartReader,
-    XMLReader,
-    PagedCSVReader,
-    CSVReader,
-    RTFReader,
 )
 class WhisperTranscriber:
     """Class for transcribing audio using OpenAI's Whisper model."""
@@ -71,4 +56,199 @@ transcribe_audio_tool = FunctionTool.from_defaults(
     name="transcribe_audio",
     description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
     fn=whisper_transcriber.transcribe
 )

 from llama_index.core.tools import FunctionTool
 from llama_index.core import SimpleDirectoryReader
 from llama_index.readers.file import (
+    ImageReader
 )
+import base64
+import sys
+import traceback
+from PIL import Image
+from llama_index.llms.openai import OpenAI
+from llama_index.llms.anthropic import Anthropic
 class WhisperTranscriber:
     """Class for transcribing audio using OpenAI's Whisper model."""
     name="transcribe_audio",
     description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
     fn=whisper_transcriber.transcribe
+)
+def encode_image_to_base64(file_path: str) -> str:
+    """
+    Reads an image file and encodes it to a base64 string.
+    This function focuses exclusively on generating a base64 encoded string from an image file.
+    Args:
+        file_path (str): Path to the image file to be encoded
+    Returns:
+        str: The base64 encoded string of the image
+    Raises:
+        FileNotFoundError: If the specified file doesn't exist
+        ValueError: If the file has an unsupported extension
+    Examples:
+        >>> base64_data = encode_image_to_base64("data/photo.jpg")
+    """
+    # Check if file exists
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found at {file_path}")
+    # Get file extension
+    file_ext = os.path.splitext(file_path)[1].lower()
+    supported_formats = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']
+    if file_ext not in supported_formats:
+        raise ValueError(f"Unsupported file extension: {file_ext}. Supported extensions are: {', '.join(supported_formats)}")
+    with open(file_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+        base64_image = encoded_string.decode('utf-8')
+    return base64_image
+# Create a function tool for image encoding
+encode_image_tool = FunctionTool.from_defaults(
+    name="encode_image_to_base64",
+    description="Reads an image file and converts it to a base64 encoded string. Use this tool to prepare images for vision analysis.",
+    fn=encode_image_to_base64
+)
+class VisionAnalyzerAgent:
+    """
+    A specialized agent for analyzing images using vision models.
+    This agent can process images, analyze their content, and provide detailed descriptions
+    or answer questions about the visual elements.
+    """
+    def __init__(
+        self,
+        model_provider: str = "openai",
+        model_name: str = "gpt-4o",
+        api_key: Optional[str] = None,
+        **kwargs
+    ):
+        """
+        Initialize a VisionAnalyzerAgent.
+        Args:
+            model_provider: The LLM provider to use ("anthropic" or "openai")
+            model_name: The specific model name to use
+            api_key: API key for the provider (defaults to environment variable)
+            **kwargs: Additional parameters for the model
+        """
+        self.model_provider = model_provider.lower()
+        self.model_name = model_name
+        self.api_key = api_key
+        # Set up the vision model client
+        if self.model_provider == "anthropic":
+            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
+        elif self.model_provider == "openai":
+            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
+        else:
+            raise ValueError(f"Unsupported model provider: {model_provider}. "
+                            f"Supported providers are: anthropic, openai")
+    def analyze_image(self, image_base64: str, query: str = "Describe this image in detail.") -> str:
+        """
+        Analyze an image using the vision model.
+        Args:
+            image_base64: Base64 encoded image data
+            query: The question or instruction for image analysis
+        Returns:
+            str: The analysis result from the vision model
+        """
+        # Prepare the image for the appropriate model
+        if self.model_provider == "anthropic":
+            # Handle Anthropic Claude models
+            try:
+                # Determine MIME type based on image data
+                mime_type = "image/jpeg"  # Default
+                if image_base64.startswith('/9j/'):
+                    mime_type = "image/jpeg"
+                elif image_base64.startswith('iVBORw0KGgo'):
+                    mime_type = "image/png"
+                # Create the message with image and text
+                response = self.client.messages.create(
+                    model=self.model_name,
+                    max_tokens=1024,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": query
+                                },
+                                {
+                                    "type": "image",
+                                    "source": {
+                                        "type": "base64",
+                                        "media_type": mime_type,
+                                        "data": image_base64
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                )
+                return response.content[0].text
+            except Exception as e:
+                return f"Error analyzing image with Anthropic: {str(e)}"
+        elif self.model_provider == "openai":
+            # Handle OpenAI GPT-4 Vision models
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model_name,
+                    max_tokens=1024,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": query
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {
+                                        "url": f"data:image/jpeg;base64,{image_base64}"
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                return f"Error analyzing image with OpenAI: {str(e)}"
+        else:
+            return "Unsupported model provider"
+# Create a function tool for the vision analyzer
+def analyze_image_with_vision(image_path: str, query: str = "Describe this image in detail.") -> str:
+    """
+    Analyze an image using a vision-enabled model.
+    Args:
+        image_path: Path to the image file
+        query: The question or instruction for image analysis
+    Returns:
+        str: The analysis result from the vision model
+    """
+    try:
+        # Encode the image to base64
+        base64_image = encode_image_to_base64(image_path)
+        # Create a vision analyzer agent and analyze the image
+        vision_agent = VisionAnalyzerAgent()
+        result = vision_agent.analyze_image(base64_image, query)
+        return result
+    except Exception as e:
+        return f"Error analyzing image: {str(e)}"
+# Create a function tool for vision analysis
+vision_analyzer_tool = FunctionTool.from_defaults(
+    name="analyze_image_with_vision",
+    description="Analyzes images using a vision-enabled model. Provide the image path and an optional query/instruction.",
+    fn=analyze_image_with_vision
 )