Adding audio tools with Whisper
Files changed:
- agents/llama_index_agent.py  +16 -5
- tools/multimedia_tools.py  +29 -93
agents/llama_index_agent.py
CHANGED
@@ -8,7 +8,9 @@ from typing import Optional, List, Any, Dict
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.anthropic import Anthropic
 # In your GaiaAgent class initialization, add these imports at the top
-from tools.multimedia_tools import
+from tools.multimedia_tools import (
+    transcribe_audio_tool,
+)
 
 from tools.web_tools import (
     tavily_tool,
@@ -61,8 +63,7 @@ class GaiaAgent(ReActAgent):
             wikipedia_tool.load_data,
             wikipedia_tool.search_data,
             tavily_tool.search,
-            get_audio_metadata
+            transcribe_audio_tool,
         ]
 
         # Use default system prompt if not provided
@@ -174,11 +175,22 @@ class GaiaAgent(ReActAgent):
 
     ## SOLVING METHODOLOGY
     1. For each question, thoroughly work through the reasoning step-by-step
-    2. Use available tools
+    2. Use available tools when needed:
+       - reverse_text_tool: For reversing text
+       - search tools (wikipedia_tool, tavily_tool): For finding information
+       - transcribe_audio: For transcribing audio files (provide the path to the audio file)
+       - get_audio_metadata: For getting metadata about audio files
     3. Document your full analysis, including all key facts, calculations, and relevant information
     4. Clearly identify what you believe the correct answer is
     5. Be extremely explicit about the required formatting for the final answer
 
+    ## HANDLING AUDIO TASKS
+    When dealing with audio files:
+    1. Use the transcribe_audio tool to get a full transcript of the audio content
+    2. Extract the specific information requested from the transcript
+    3. Format your answer exactly as requested in the question
+    4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed
+
     ## DELEGATION TO WRITER AGENT
     After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
     - query: The original question
@@ -197,7 +209,6 @@ class GaiaAgent(ReActAgent):
     IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
     """
 
-
 def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
     """
     Create a writer agent that formats final answers based on research notes.
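
Note: the agent's tools list now mixes plain callables (wikipedia_tool.load_data, tavily_tool.search) with transcribe_audio_tool, which is already a FunctionTool. A minimal sketch, not part of the commit, of how that mixed list can be normalized before handing it to the agent; the import path for wikipedia_tool is an assumption:

    # Sketch only: normalize a mixed list of callables and FunctionTools.
    from llama_index.core.tools import BaseTool, FunctionTool

    from tools.multimedia_tools import transcribe_audio_tool
    from tools.web_tools import tavily_tool, wikipedia_tool  # wikipedia_tool path assumed

    raw_tools = [
        wikipedia_tool.load_data,
        wikipedia_tool.search_data,
        tavily_tool.search,
        transcribe_audio_tool,
    ]

    # Wrap plain callables in FunctionTool; existing tools pass through unchanged.
    tools = [
        t if isinstance(t, BaseTool) else FunctionTool.from_defaults(fn=t)
        for t in raw_tools
    ]

Some llama_index agent constructors accept bare callables directly, in which case the list can also be passed exactly as it appears in the diff.
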
tools/multimedia_tools.py
CHANGED
@@ -1,114 +1,50 @@
 import os
-from typing import Optional,
-from pathlib import Path
+from typing import Optional, Dict, Any
 from llama_index.readers.whisper import WhisperReader
-from llama_index.core.
+from llama_index.core.tools import FunctionTool
 
 
+class WhisperTranscriber:
+    """Class for transcribing audio using OpenAI's Whisper model."""
 
     def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
-        """
-        Args:
-            model: The Whisper model to use
-            api_key: OpenAI API key (defaults to OPENAI_API_KEY environment variable)
-        """
+        """Initialize the WhisperTranscriber with specified model and API key."""
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        self.model = model
         self.reader = WhisperReader(
-            model=model,
-            api_key=api_key
+            model=self.model,
+            api_key=self.api_key,
         )
 
-    def transcribe(self,
+    def transcribe(self, audio_file_path: str) -> str:
         """
         Transcribe an audio file to text.
 
         Args:
-            audio_path: Path to the audio file
+            audio_file_path: Path to the audio file (.mp3, .wav, etc.)
 
         Returns:
-            Transcribed text content
+            Transcribed text from the audio file
         """
         try:
-            documents = self.reader.load_data(audio_path)
-            if not documents:
-                return "No transcription was generated."
-
-            # Return the combined text from all documents
-            return "\n\n".join([doc.text for doc in documents])
+            # Load data from audio file
+            documents = self.reader.load_data(audio_file_path)
+
+            # Extract and concatenate text from all returned documents
+            if documents and len(documents) > 0:
+                transcription = " ".join([doc.text for doc in documents if hasattr(doc, 'text')])
+                return transcription
+            return "No transcription was generated from the audio file."
         except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    async def transcribe_async(self, audio_path: str) -> str:
-        """
-        Transcribe an audio file to text asynchronously.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Transcribed text
-        """
-        try:
-            return "No transcription was generated."
-        except Exception as e:
-            return f"Error transcribing audio: {str(e)}"
-
-    def get_metadata(self, audio_path: str) -> dict:
-        """
-        Get metadata about an audio file.
-
-        Args:
-            audio_path: Path to the audio file
-
-        Returns:
-            Dictionary containing metadata
-        """
-        path = Path(audio_path)
-        try:
-            # Basic file metadata
-            metadata = {
-                "filename": path.name,
-                "extension": path.suffix,
-                "size_bytes": path.stat().st_size if path.exists() else None,
-                "exists": path.exists(),
-                "is_file": path.is_file() if path.exists() else None,
-            }
-            return metadata
-        except Exception as e:
-            return {"error": str(e)}
+            return f"Error transcribing audio file: {str(e)}"
 
 
-        audio_path: Path to the audio file
-
-    Returns:
-        Transcribed text content
-    """
-    return whisper_tool.transcribe(audio_path)
-
-def get_audio_metadata(audio_path: str) -> dict:
-    """
-    Get metadata about an audio file.
-
-    Args:
-        audio_path: Path to the audio file
-
-    Returns:
-        Dictionary containing metadata
-    """
-    return whisper_tool.get_metadata(audio_path)
+# Initialize the transcriber
+whisper_transcriber = WhisperTranscriber()
+
+# Create a function tool for audio transcription
+transcribe_audio_tool = FunctionTool.from_defaults(
+    name="transcribe_audio",
+    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
+    fn=whisper_transcriber.transcribe
+)
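
A quick usage sketch of the new module, outside the commit itself. The file path is hypothetical, and it assumes the WhisperReader package (llama-index-readers-whisper) is installed and OPENAI_API_KEY is set, since WhisperTranscriber falls back to that environment variable:

    # pip install llama-index-readers-whisper   (assumed package providing WhisperReader)
    # export OPENAI_API_KEY=...                 (picked up when api_key is not passed)
    from tools.multimedia_tools import whisper_transcriber, transcribe_audio_tool

    # Call the transcriber directly with a (hypothetical) local file...
    text = whisper_transcriber.transcribe("downloads/interview.mp3")
    print(text)

    # ...or go through the FunctionTool, the way the agent invokes it.
    result = transcribe_audio_tool.call(audio_file_path="downloads/interview.mp3")
    print(result.content)

Because transcribe returns an error string instead of raising, the agent always gets text back; callers that need to distinguish failures should check for the "Error transcribing audio file:" prefix.
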