import base64
import os
from typing import Optional

# The analyze_image method below calls the raw SDK surfaces
# (client.messages.create / client.chat.completions.create), so the native
# anthropic/openai packages are required here, not the LlamaIndex LLM wrappers.
from anthropic import Anthropic
from openai import OpenAI

from llama_index.core.tools import FunctionTool
from llama_index.readers.whisper import WhisperReader


class WhisperTranscriber:
    """Class for transcribing audio using OpenAI's Whisper model."""

    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """Initialize the WhisperTranscriber with the specified model and API key."""
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(
            model=self.model,
            api_key=self.api_key,
        )

    def transcribe(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)

        Returns:
            Transcribed text from the audio file
        """
        try:
            # Load data from the audio file
            documents = self.reader.load_data(audio_file_path)

            # Extract and concatenate text from all returned documents
            if documents:
                return " ".join(
                    doc.text for doc in documents if hasattr(doc, "text")
                )
            return "No transcription was generated from the audio file."
        except Exception as e:
            return f"Error transcribing audio file: {str(e)}"


# Initialize the transcriber
whisper_transcriber = WhisperTranscriber()

# Create a function tool for audio transcription
transcribe_audio_tool = FunctionTool.from_defaults(
    name="transcribe_audio",
    description=(
        "Transcribes speech from an audio file to text using OpenAI's Whisper "
        "model. Provide the full path to the audio file."
    ),
    fn=whisper_transcriber.transcribe,
)


def encode_image_to_base64(file_path: str) -> str:
    """
    Read an image file and encode it to a base64 string.

    This function focuses exclusively on generating a base64 encoded string
    from an image file.

    Args:
        file_path (str): Path to the image file to be encoded

    Returns:
        str: The base64 encoded string of the image

    Raises:
        FileNotFoundError: If the specified file doesn't exist
        ValueError: If the file has an unsupported extension

    Examples:
        >>> base64_data = encode_image_to_base64("data/photo.jpg")
    """
    # Check that the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at {file_path}")

    # Validate the file extension
    file_ext = os.path.splitext(file_path)[1].lower()
    supported_formats = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"]
    if file_ext not in supported_formats:
        raise ValueError(
            f"Unsupported file extension: {file_ext}. "
            f"Supported extensions are: {', '.join(supported_formats)}"
        )

    # Read the raw bytes and encode them as a UTF-8 base64 string
    with open(file_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


# Create a function tool for image encoding
encode_image_tool = FunctionTool.from_defaults(
    name="encode_image_to_base64",
    description=(
        "Reads an image file and converts it to a base64 encoded string. "
        "Use this tool to prepare images for vision analysis."
    ),
    fn=encode_image_to_base64,
)


class VisionAnalyzerAgent:
    """
    A specialized agent for analyzing images using vision models.

    This agent can process images, analyze their content, and provide detailed
    descriptions or answer questions about the visual elements.
    """

    def __init__(
        self,
        model_provider: str = "openai",
        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize a VisionAnalyzerAgent.
        Args:
            model_provider: The LLM provider to use ("anthropic" or "openai")
            model_name: The specific model name to use
            api_key: API key for the provider (defaults to environment variable)
            **kwargs: Additional parameters for the model
        """
        self.model_provider = model_provider.lower()
        self.model_name = model_name
        self.api_key = api_key

        # Set up the vision model client
        if self.model_provider == "anthropic":
            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
        elif self.model_provider == "openai":
            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        else:
            raise ValueError(
                f"Unsupported model provider: {model_provider}. "
                f"Supported providers are: anthropic, openai"
            )

    def analyze_image(
        self, image_base64: str, query: str = "Describe this image in detail."
    ) -> str:
        """
        Analyze an image using the vision model.

        Args:
            image_base64: Base64 encoded image data
            query: The question or instruction for image analysis

        Returns:
            str: The analysis result from the vision model
        """
        # Infer the MIME type from the base64 payload's leading magic bytes
        mime_type = "image/jpeg"  # Default
        if image_base64.startswith("/9j/"):
            mime_type = "image/jpeg"
        elif image_base64.startswith("iVBORw0KGgo"):
            mime_type = "image/png"

        if self.model_provider == "anthropic":
            # Handle Anthropic Claude models
            try:
                # Create the message with image and text content blocks
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": query},
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": mime_type,
                                        "data": image_base64,
                                    },
                                },
                            ],
                        }
                    ],
                )
                return response.content[0].text
            except Exception as e:
                return f"Error analyzing image with Anthropic: {str(e)}"

        elif self.model_provider == "openai":
            # Handle OpenAI vision-capable models (e.g., gpt-4o)
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": query},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_base64}"
                                    },
                                },
                            ],
                        }
                    ],
                )
                return response.choices[0].message.content
            except Exception as e:
                return f"Error analyzing image with OpenAI: {str(e)}"

        return "Unsupported model provider"


def analyze_image_with_vision(
    image_path: str, query: str = "Describe this image in detail."
) -> str:
    """
    Analyze an image using a vision-enabled model.

    Args:
        image_path: Path to the image file
        query: The question or instruction for image analysis

    Returns:
        str: The analysis result from the vision model
    """
    try:
        # Encode the image to base64
        base64_image = encode_image_to_base64(image_path)

        # Create a vision analyzer agent and analyze the image
        vision_agent = VisionAnalyzerAgent()
        return vision_agent.analyze_image(base64_image, query)
    except Exception as e:
        return f"Error analyzing image: {str(e)}"


# Create a function tool for vision analysis
vision_analyzer_tool = FunctionTool.from_defaults(
    name="analyze_image_with_vision",
    description=(
        "Analyzes images using a vision-enabled model. Provide the image path "
        "and an optional query/instruction."
    ),
    fn=analyze_image_with_vision,
)
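

# --- Usage sketch (illustrative only) ---
# A minimal sketch showing how the pieces above can be exercised directly.
# The file paths "data/meeting.mp3" and "data/photo.jpg" are hypothetical
# placeholders, and running this assumes OPENAI_API_KEY is set in the
# environment.
if __name__ == "__main__":
    # Transcribe a local audio file with the Whisper-backed transcriber
    transcript = whisper_transcriber.transcribe("data/meeting.mp3")
    print(f"Transcript: {transcript}")

    # Encode and analyze a local image with the default OpenAI vision model
    description = analyze_image_with_vision(
        "data/photo.jpg", query="What objects are visible in this image?"
    )
    print(f"Image analysis: {description}")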