import base64
import os
from typing import Optional

# Raw SDK clients (not the LlamaIndex LLM wrappers): the vision agent below
# calls client.messages.create / client.chat.completions.create directly,
# which only the official anthropic/openai packages expose.
from anthropic import Anthropic
from openai import OpenAI

from llama_index.core.tools import FunctionTool
from llama_index.readers.whisper import WhisperReader

class WhisperTranscriber:
    """Class for transcribing audio using OpenAI's Whisper model."""
    
    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """Initialize the WhisperTranscriber with specified model and API key."""
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(
            model=self.model,
            api_key=self.api_key,
        )
    
    def transcribe(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.
        
        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)
            
        Returns:
            Transcribed text from the audio file
        """
        try:
            # Load data from audio file
            documents = self.reader.load_data(audio_file_path)
            
            # Extract and concatenate text from all returned documents
            if documents:
                return " ".join(doc.text for doc in documents if hasattr(doc, "text"))
            return "No transcription was generated from the audio file."
        except Exception as e:
            return f"Error transcribing audio file: {str(e)}"


# Initialize the transcriber
whisper_transcriber = WhisperTranscriber()

# Create a function tool for audio transcription
transcribe_audio_tool = FunctionTool.from_defaults(
    name="transcribe_audio",
    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
    fn=whisper_transcriber.transcribe
)
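
# Example usage (illustrative sketch: "data/meeting.mp3" is a hypothetical
# path, and OPENAI_API_KEY must be set in the environment):
#
#   text = whisper_transcriber.transcribe("data/meeting.mp3")
#   print(text)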


def encode_image_to_base64(file_path: str) -> str:
    """
    Reads an image file and encodes it to a base64 string.
    
    This function focuses exclusively on generating a base64 encoded string from an image file.
    
    Args:
        file_path (str): Path to the image file to be encoded
            
    Returns:
        str: The base64 encoded string of the image
            
    Raises:
        FileNotFoundError: If the specified file doesn't exist
        ValueError: If the file has an unsupported extension
            
    Examples:
        >>> base64_data = encode_image_to_base64("data/photo.jpg")
    """    
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at {file_path}")
    
    # Get file extension
    file_ext = os.path.splitext(file_path)[1].lower()
    supported_formats = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']
    
    if file_ext not in supported_formats:
        raise ValueError(f"Unsupported file extension: {file_ext}. Supported extensions are: {', '.join(supported_formats)}")
    
    with open(file_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
        base64_image = encoded_string.decode('utf-8')
            
    return base64_image
        
# Create a function tool for image encoding
encode_image_tool = FunctionTool.from_defaults(
    name="encode_image_to_base64",
    description="Reads an image file and converts it to a base64 encoded string. Use this tool to prepare images for vision analysis.",
    fn=encode_image_to_base64
)
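
# Example usage (illustrative sketch: "data/photo.jpg" is a hypothetical path):
#
#   b64 = encode_image_to_base64("data/photo.jpg")
#   print(f"Encoded {len(b64)} base64 characters")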

class VisionAnalyzerAgent:
    """
    A specialized agent for analyzing images using vision models.
    
    This agent can process images, analyze their content, and provide detailed descriptions
    or answer questions about the visual elements.
    """
    
    def __init__(
        self,
        model_provider: str = "openai", 
        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize a VisionAnalyzerAgent.
        
        Args:
            model_provider: The LLM provider to use ("anthropic" or "openai")
            model_name: The specific model name to use
            api_key: API key for the provider (defaults to environment variable)
            **kwargs: Additional parameters for the model
        """
        self.model_provider = model_provider.lower()
        self.model_name = model_name
        self.api_key = api_key
        
        # Set up the vision model client
        if self.model_provider == "anthropic":
            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
        elif self.model_provider == "openai":
            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        else:
            raise ValueError(f"Unsupported model provider: {model_provider}. "
                            f"Supported providers are: anthropic, openai")
    
    def analyze_image(self, image_base64: str, query: str = "Describe this image in detail.") -> str:
        """
        Analyze an image using the vision model.
        
        Args:
            image_base64: Base64 encoded image data
            query: The question or instruction for image analysis
            
        Returns:
            str: The analysis result from the vision model
        """
        # Prepare the image for the appropriate model
        if self.model_provider == "anthropic":
            # Handle Anthropic Claude models
            try:
                # Determine MIME type based on image data
                mime_type = "image/jpeg"  # Default
                if image_base64.startswith('/9j/'):
                    mime_type = "image/jpeg"
                elif image_base64.startswith('iVBORw0KGgo'):
                    mime_type = "image/png"
                
                # Create the message with image and text
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": mime_type,
                                        "data": image_base64
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.content[0].text
                
            except Exception as e:
                return f"Error analyzing image with Anthropic: {str(e)}"
                
        elif self.model_provider == "openai":
            # Handle OpenAI GPT-4 Vision models
            try:
                # Reuse the same magic-byte check as the Anthropic branch so
                # PNG data isn't mislabeled as JPEG in the data URL.
                mime_type = "image/png" if image_base64.startswith("iVBORw0KGgo") else "image/jpeg"
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    max_tokens=4096,  # the original 1024*20 exceeds gpt-4o's 16k output-token cap
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_base64}"
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.choices[0].message.content
                
            except Exception as e:
                return f"Error analyzing image with OpenAI: {str(e)}"
        
        else:
            return "Unsupported model provider"

# Create a function tool for the vision analyzer
def analyze_image_with_vision(image_path: str, query: str = "Describe this image in detail.") -> str:
    """
    Analyze an image using a vision-enabled model.
    
    Args:
        image_path: Path to the image file
        query: The question or instruction for image analysis
        
    Returns:
        str: The analysis result from the vision model
    """
    try:
        # Encode the image to base64
        base64_image = encode_image_to_base64(image_path)
        
        # Create a vision analyzer agent and analyze the image
        vision_agent = VisionAnalyzerAgent()
        result = vision_agent.analyze_image(base64_image, query)
        
        return result
    except Exception as e:
        return f"Error analyzing image: {str(e)}"

# Create a function tool for vision analysis
vision_analyzer_tool = FunctionTool.from_defaults(
    name="analyze_image_with_vision",
    description="Analyzes images using a vision-enabled model. Provide the image path and an optional query/instruction.",
    fn=analyze_image_with_vision
)
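

# A minimal end-to-end sketch wiring the three tools into a LlamaIndex agent.
# Assumptions: OPENAI_API_KEY is set, "data/photo.jpg" is a hypothetical sample
# image, and ReActAgent is one reasonable agent choice, not the only one.
if __name__ == "__main__":
    from llama_index.core.agent import ReActAgent
    from llama_index.llms.openai import OpenAI as LlamaOpenAI

    agent = ReActAgent.from_tools(
        tools=[transcribe_audio_tool, encode_image_tool, vision_analyzer_tool],
        llm=LlamaOpenAI(model="gpt-4o"),
        verbose=True,
    )
    response = agent.chat("What is shown in the image at data/photo.jpg?")
    print(response)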