agents_course_final_assignement

Paused

File size: 9,523 Bytes

import os
from typing import Optional, Dict, Any
from llama_index.readers.whisper import WhisperReader
from llama_index.core.tools import FunctionTool
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (
    ImageReader
)
import base64
import sys
import traceback
from PIL import Image
from llama_index.llms.openai import OpenAI
from llama_index.llms.anthropic import Anthropic

class WhisperTranscriber:
    """Speech-to-text helper built on top of a ``WhisperReader``."""

    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """
        Remember the configuration and build the underlying reader.

        Args:
            model: Model identifier handed to the reader.
            api_key: API key for the reader; when omitted (or empty) the
                ``OPENAI_API_KEY`` environment variable is used instead.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(model=self.model, api_key=self.api_key)

    def transcribe(self, audio_file_path: str) -> str:
        """
        Turn the speech in an audio file into text.

        Any failure is reported through the returned string rather than
        raised, so the caller always gets a ``str`` back.

        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)

        Returns:
            The text of every returned document joined with single spaces,
            a fixed notice when the reader produced no documents, or an
            error description when anything went wrong.
        """
        try:
            docs = self.reader.load_data(audio_file_path)

            # Guard clause: nothing came back from the reader
            if not (docs and len(docs) > 0):
                return "No transcription was generated from the audio file."

            # Only documents that actually carry a `text` attribute contribute
            return " ".join(doc.text for doc in docs if hasattr(doc, "text"))
        except Exception as exc:
            return f"Error transcribing audio file: {exc!s}"


# Module-level transcriber instance (built with the default model and key lookup)
whisper_transcriber = WhisperTranscriber()

# Wrap the instance's `transcribe` method as a tool
transcribe_audio_tool = FunctionTool.from_defaults(
    fn=whisper_transcriber.transcribe,
    name="transcribe_audio",
    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
)


def encode_image_to_base64(file_path: str) -> str:
    """
    Read an image file and return its raw bytes as a base64 string.

    The file content is not inspected or validated; only the path and its
    extension are checked before the bytes are encoded.

    Args:
        file_path (str): Path to the image file to be encoded

    Returns:
        str: The base64 encoded string of the image

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular file
        ValueError: If the file has an unsupported extension

    Examples:
        >>> base64_data = encode_image_to_base64("data/photo.jpg")
    """
    # isfile() instead of exists(): a directory would pass an exists() check
    # and then fail inside open() with a different, undocumented OSError.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found at {file_path}")

    # Extension check is case-insensitive (".PNG" is accepted)
    supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext not in supported_formats:
        raise ValueError(f"Unsupported file extension: {file_ext}. Supported extensions are: {', '.join(supported_formats)}")

    with open(file_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
        
# Tool wrapper around `encode_image_to_base64`
encode_image_tool = FunctionTool.from_defaults(
    fn=encode_image_to_base64,
    name="encode_image_to_base64",
    description="Reads an image file and converts it to a base64 encoded string. Use this tool to prepare images for vision analysis.",
)

class VisionAnalyzerAgent:
    """
    A specialized agent for analyzing images using vision models.

    The agent sends a base64 encoded image together with a text query to the
    configured provider ("anthropic" or "openai") and returns the model's
    textual answer. Failures during the request are returned as an error
    string rather than raised.
    """

    # Base64 text prefixes produced by well-known image file signatures.
    _BASE64_SIGNATURES = (
        (("/9j/",), "image/jpeg"),                 # bytes FF D8 FF
        (("iVBORw0KGgo",), "image/png"),           # bytes \x89PNG\r\n\x1a\n
        (("R0lGODdh", "R0lGODlh"), "image/gif"),   # bytes GIF87a / GIF89a
    )
    # Used when the data matches none of the known signatures.
    _DEFAULT_MIME_TYPE = "image/jpeg"

    def __init__(
        self,
        model_provider: str = "openai",
        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize a VisionAnalyzerAgent.

        Args:
            model_provider: The LLM provider to use ("anthropic" or "openai");
                compared case-insensitively
            model_name: The specific model name to use
            api_key: API key for the provider; when omitted, the
                ANTHROPIC_API_KEY / OPENAI_API_KEY environment variable is used
            **kwargs: Additional parameters for the model. Accepted for
                interface compatibility; not used by this class at present.

        Raises:
            ValueError: If model_provider is neither "anthropic" nor "openai"
        """
        self.model_provider = model_provider.lower()
        self.model_name = model_name
        # Stored exactly as passed in (may be None even when an env key is used)
        self.api_key = api_key

        # Set up the vision model client
        # NOTE(review): analyze_image() calls `client.messages.create` and
        # `client.chat.completions.create`; confirm that the `Anthropic` and
        # `OpenAI` classes imported by this file expose those attributes.
        if self.model_provider == "anthropic":
            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
        elif self.model_provider == "openai":
            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        else:
            raise ValueError(f"Unsupported model provider: {model_provider}. "
                            f"Supported providers are: anthropic, openai")

    @classmethod
    def _detect_mime_type(cls, image_base64: str) -> str:
        """Guess the image MIME type from the leading characters of base64 data."""
        for prefixes, mime_type in cls._BASE64_SIGNATURES:
            if image_base64.startswith(prefixes):
                return mime_type
        # WebP is a RIFF container: "RIFF" + 4 size bytes + "WEBP". The size
        # bytes vary, so match the fixed characters on either side of them.
        if image_base64.startswith("UklGR") and image_base64[11:16] == "XRUJQ":
            return "image/webp"
        return cls._DEFAULT_MIME_TYPE

    def analyze_image(self, image_base64: str, query: str = "Describe this image in detail.") -> str:
        """
        Analyze an image using the vision model.

        The MIME type announced to the provider is derived from the image
        data itself (JPEG, PNG, GIF and WebP are recognised; anything else is
        announced as JPEG).

        Args:
            image_base64: Base64 encoded image data
            query: The question or instruction for image analysis

        Returns:
            str: The analysis result from the vision model, an
            "Error analyzing image with ..." message if the request failed,
            or "Unsupported model provider" for an unknown provider
        """
        if self.model_provider == "anthropic":
            # Handle Anthropic Claude models
            try:
                mime_type = self._detect_mime_type(image_base64)

                # Create the message with image and text
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": mime_type,
                                        "data": image_base64
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.content[0].text

            except Exception as e:
                return f"Error analyzing image with Anthropic: {str(e)}"

        elif self.model_provider == "openai":
            # Handle OpenAI GPT-4 Vision models
            try:
                # The data URL must announce the real image type, so it is
                # sniffed the same way as for the Anthropic branch.
                mime_type = self._detect_mime_type(image_base64)

                response = self.client.chat.completions.create(
                    model=self.model_name,
                    # NOTE(review): 20480 output tokens; confirm the selected
                    # model accepts a limit this high.
                    max_tokens=1024*20,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_base64}"
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.choices[0].message.content

            except Exception as e:
                return f"Error analyzing image with OpenAI: {str(e)}"

        else:
            # Only reachable if model_provider was changed after construction
            return "Unsupported model provider"

# Create a function tool for the vision analyzer
def analyze_image_with_vision(image_path: str, query: str = "Describe this image in detail.") -> str:
    """
    Run a vision-enabled model over an image stored on disk.

    The image is first base64 encoded, then handed to a freshly created
    ``VisionAnalyzerAgent`` (default settings) together with the query.
    Any exception along the way is converted into an error string.

    Args:
        image_path: Path to the image file
        query: The question or instruction for image analysis

    Returns:
        str: The analysis result from the vision model, or an
        "Error analyzing image: ..." message on failure
    """
    try:
        # Encoding happens before the agent is built, so path problems
        # surface without constructing a client.
        encoded = encode_image_to_base64(image_path)
        return VisionAnalyzerAgent().analyze_image(encoded, query)
    except Exception as exc:
        return f"Error analyzing image: {exc!s}"

# Tool wrapper around `analyze_image_with_vision`
vision_analyzer_tool = FunctionTool.from_defaults(
    fn=analyze_image_with_vision,
    name="analyze_image_with_vision",
    description="Analyzes images using a vision-enabled model. Provide the image path and an optional query/instruction.",
)