Final_Assignment_Project

Running

App Files Files Community

wt002 commited on 4 days ago

Commit

a37281a

verified ·

1 Parent(s): 29bc439

Update app.py

Browse files

Files changed (1) hide show

app.py +166 -187

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import gradio as gr
 import requests
 import speech_recognition as sr
 from smolagents import OpenAIServerModel, DuckDuckGoSearchTool, CodeAgent, WikipediaSearchTool
 from pathlib import Path
@@ -16,189 +17,28 @@ from langchain.agents import initialize_agent
 from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
 from langchain_community.llms import HuggingFaceHub
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-class SpeechToTextTool(PipelineTool):
-    """
-    Transcribes an audio file to text using the OpenAI Whisper API.
-    Only local file paths are supported.
-    """
-    default_checkpoint = "openai/whisper-1"   # purely informational here
-    description = (
-        "This tool sends an audio file to OpenAI Whisper and returns the "
-        "transcribed text."
-    )
-    name = "transcriber"
-    inputs = {
-        "audio": {
-            "type": "string",
-            "description": "Absolute or relative path to a local audio file.",
-        }
-    }
-    output_type = "string"
-    # ──────────────────────────────────────────────────────────────────────────
-    # Public interface
-    # ──────────────────────────────────────────────────────────────────────────
-    def __call__(self, audio: str) -> str:
-        """
-        Convenience wrapper so the tool can be used like a regular function:
-            text = SpeechToTextTool()(path_to_audio)
-        """
-        return self._transcribe(audio)
-    # ──────────────────────────────────────────────────────────────────────────
-    # Internal helpers
-    # ──────────────────────────────────────────────────────────────────────────
-    @staticmethod
-    def _transcribe(audio_path: str) -> str:
-        # ----- validation ----------------------------------------------------
-        if not isinstance(audio_path, str):
-            raise TypeError(
-                "Parameter 'audio' must be a string containing the file path."
-            )
-        path = Path(audio_path).expanduser().resolve()
-        if not path.is_file():
-            raise FileNotFoundError(f"No such audio file: {path}")
-        # ----- API call ------------------------------------------------------
-        with path.open("rb") as fp:
-            response = openai.audio.transcriptions.create(
-                file=fp,
-                model="whisper-1",        # currently the only Whisper model
-                response_format="text"    # returns plain text instead of JSON
-            )
-        # For response_format="text", `response` is already the raw transcript
-        return response
-    def transcribe_audio(audio_file_path):
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(audio_file_path) as source:
-        audio_data = recognizer.record(source)
-        try:
-            text = recognizer.recognize_google(audio_data)
-            return text
-        except sr.UnknownValueError:
-            return "Could not understand audio"
-        except sr.RequestError:
-            return "Could not request results (check internet connection)"
-class ExcelToTextTool(Tool):
-    """Render an Excel worksheet as Markdown text."""
-    # ------------------------------------------------------------------
-    # Required smol‑agents metadata
-    # ------------------------------------------------------------------
-    name = "excel_to_text"
-    description = (
-        "Read an Excel file and return a Markdown table of the requested sheet. "
-        "Accepts either the sheet name or the zero-based index."
-    )
-    inputs = {
-        "excel_path": {
-            "type": "string",
-            "description": "Path to the Excel file (.xlsx / .xls).",
-        },
-        "sheet_name": {
-            "type": "string",
-            "description": (
-                "Worksheet name or zero‑based index *as a string* (optional; default first sheet)."
-            ),
-            "nullable": True,
-        },
-    }
-    output_type = "string"
-    # ------------------------------------------------------------------
-    # Core logic
-    # ------------------------------------------------------------------
-    def forward(
-            self,
-            excel_path: str,
-            sheet_name: Optional[str] = None,
-    ) -> str:
-        """Load *excel_path* and return the sheet as a Markdown table."""
-        path = pathlib.Path(excel_path).expanduser().resolve()
-        if not path.exists():
-            return f"Error: Excel file not found at {path}"
-        try:
-            # Interpret sheet identifier -----------------------------------
-            sheet: Union[str, int]
-            if sheet_name is None or sheet_name == "":
-                sheet = 0  # first sheet
-            else:
-                # If the user passed a numeric string (e.g. "1"), cast to int
-                sheet = int(sheet_name) if sheet_name.isdigit() else sheet_name
-            # Load worksheet ----------------------------------------------
-            df = pd.read_excel(path, sheet_name=sheet)
-            # Render to Markdown; fall back to tabulate if needed ---------
-            if hasattr(pd.DataFrame, "to_markdown"):
-                return df.to_markdown(index=False)
-            from tabulate import tabulate  # pragma: no cover – fallback path
-            return tabulate(df, headers="keys", tablefmt="github", showindex=False)
-        except Exception as exc:  # broad catch keeps the agent chat‑friendly
-            return f"Error reading Excel file: {exc}"
-def download_file_if_any(base_api_url: str, task_id: str) -> str | None:
-    """
-    Try GET /files/{task_id}.
-    • On HTTP 200 → save to a temp dir and return local path.
-    • On 404 → return None.
-    • On other errors → raise so caller can log / handle.
-    """
-    url = f"{base_api_url}/files/{task_id}"
-    try:
-        resp = requests.get(url, timeout=30)
-        if resp.status_code == 404:
-            return None          # no file
-        resp.raise_for_status()   # raise on 4xx/5xx ≠ 404
-    except requests.exceptions.HTTPError as e:
-        # propagate non-404 errors (403, 500, …)
-        raise e
-    # ▸ Save bytes to a named file inside the system temp dir
-    #    Try to keep original extension from Content-Disposition if present.
-    cdisp = resp.headers.get("content-disposition", "")
-    filename = task_id                                 # default base name
-    if "filename=" in cdisp:
-        m = re.search(r'filename="([^"]+)"', cdisp)
-        if m:
-            filename = m.group(1)                      # keep provided name
-    tmp_dir = Path(tempfile.gettempdir()) / "gaia_files"
-    tmp_dir.mkdir(exist_ok=True)
-    file_path = tmp_dir / filename
-    with open(file_path, "wb") as f:
-        f.write(resp.content)
-    return str(file_path)
 # --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
-        # Initialize LLM (requires HuggingFace API token)
-        llm = HuggingFaceHub(
-            repo_id="meta-llama/Meta-Llama-3-8B-Instruct"  #,
-         #   huggingfacehub_api_token="your_token"
-        )
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
@@ -207,23 +47,162 @@ class BasicAgent:
         print(f"Agent returning answer: {fixed_answer}")
         return fixed_answer
         # Initialize tools
-        tools = [
-            DuckDuckGoSearchRun(),
-            WikipediaQueryRun()
-            # Would need custom implementations for other tools
-        ]
-        self.agent = initialize_agent(
-            tools=tools,
-            llm=llm,
-            agent="zero-shot-react-description",
-            verbose=True
         )
-    def run(self, prompt):
-        return self.agent.run(prompt)
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """

 import os
 import gradio as gr
 import requests
 import speech_recognition as sr
 from smolagents import OpenAIServerModel, DuckDuckGoSearchTool, CodeAgent, WikipediaSearchTool
 from pathlib import Path
 from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
 from langchain_community.llms import HuggingFaceHub
+from typing import Union
+import os
+from langchain.agents import AgentExecutor, Tool, initialize_agent
+from langchain_community.llms import Ollama
+from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
+from langchain_community.document_loaders import (
+    CSVLoader,
+    PyPDFLoader,
+    UnstructuredWordDocumentLoader
+)
+from langchain_community.utilities import TextRequestsWrapper
+import speech_recognition as sr
+from pydub import AudioSegment  # For audio format conversion
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent returning answer: {fixed_answer}")
         return fixed_answer
+    def __init__(self, model_name: str = "llama3"):
+        """
+        Open-source multi-modal agent with:
+        - Web search
+        - Document processing
+        - Speech-to-text
+        - URL content fetching
+        """
+        # Initialize LLM (local via Ollama)
+        self.llm = Ollama(model=model_name, temperature=0.7)
         # Initialize tools
+        self.search_tool = DuckDuckGoSearchRun()
+        self.wikipedia_tool = WikipediaQueryRun()
+        self.requests_tool = TextRequestsWrapper()
+        # Speech recognition
+        self.recognizer = sr.Recognizer()
+        # Initialize agent
+        self.tools = self._initialize_tools()
+        self.agent = self._create_agent()
+    def _initialize_tools(self) -> list[Tool]:
+        """Initialize all available tools"""
+        return [
+            Tool(
+                name="Web Search",
+                func=self.search_tool.run,
+                description="For current events/unknown topics"
+            ),
+            Tool(
+                name="Wikipedia",
+                func=self.wikipedia_tool.run,
+                description="For factual information"
+            ),
+            Tool(
+                name="Document Loader",
+                func=self.process_document,
+                description="Processes PDF, Word, CSV files"
+            ),
+            Tool(
+                name="Speech Transcription",
+                func=self.transcribe_audio,
+                description="Converts speech from audio files to text"
+            ),
+            Tool(
+                name="Website Content",
+                func=self.requests_tool.get,
+                description="Fetches content from URLs"
+            )
+        ]
+    def _create_agent(self) -> AgentExecutor:
+        """Create the agent executor"""
+        return initialize_agent(
+            tools=self.tools,
+            llm=self.llm,
+            agent="structured-chat-react",
+            verbose=True,
+            handle_parsing_errors=True
         )
+    def process_document(self, file_path: str) -> str:
+        """Handle different document types"""
+        if not os.path.exists(file_path):
+            return "File not found"
+        ext = os.path.splitext(file_path)[1].lower()
+        try:
+            if ext == '.pdf':
+                loader = PyPDFLoader(file_path)
+            elif ext in ('.doc', '.docx'):
+                loader = UnstructuredWordDocumentLoader(file_path)
+            elif ext == '.csv':
+                loader = CSVLoader(file_path)
+            else:
+                return "Unsupported file format"
+            docs = loader.load()
+            return "\n".join([doc.page_content for doc in docs])
+        except Exception as e:
+            return f"Error processing document: {str(e)}"
+    def _convert_audio_format(self, audio_path: str) -> str:
+        """Convert audio to WAV format if needed"""
+        if audio_path.endswith('.wav'):
+            return audio_path
+        try:
+            sound = AudioSegment.from_file(audio_path)
+            wav_path = os.path.splitext(audio_path)[0] + ".wav"
+            sound.export(wav_path, format="wav")
+            return wav_path
+        except:
+            return audio_path  # Fallback to original if conversion fails
+    def transcribe_audio(self, audio_path: str) -> str:
+        """Convert speech to text using purely open-source tools"""
+        audio_path = self._convert_audio_format(audio_path)
+        try:
+            with sr.AudioFile(audio_path) as source:
+                audio = self.recognizer.record(source)
+                return self.recognizer.recognize_vosk(audio)  # Offline recognition
+        except sr.UnknownValueError:
+            try:
+                # Fallback to Sphinx if Vosk fails
+                return self.recognizer.recognize_sphinx(audio)
+            except Exception as e:
+                return f"Transcription failed: {str(e)}"
+    def run(self, input_data: Union[str, dict]) -> str:
+        """
+        Handle different input types:
+        - Text queries
+        - File paths
+        - Structured requests
+        """
+        if isinstance(input_data, dict):
+            if 'query' in input_data:
+                return self.agent.run(input_data['query'])
+            elif 'file' in input_data:
+                content = self.process_document(input_data['file'])
+                return self.agent.run(f"Process this: {content}")
+        elif isinstance(input_data, str):
+            if input_data.endswith(('.pdf', '.docx', '.csv')):
+                content = self.process_document(input_data)
+                return self.agent.run(f"Process this document: {content}")
+            elif input_data.endswith(('.wav', '.mp3', '.ogg')):
+                content = self.transcribe_audio(input_data)
+                return self.agent.run(f"Process this transcript: {content}")
+            else:
+                return self.agent.run(input_data)
+        return "Unsupported input type"
+# Usage Example
+if __name__ == "__main__":
+    agent = FullyOpenSourceAgent(model_name="mistral")  # Try "llama3", "gemma", etc.
+    # Example 1: Web search
+    print(agent.run("Latest breakthroughs in renewable energy"))
+    # Example 2: Process document
+    print(agent.run({"file": "research.pdf"}))
+    # Example 3: Complex workflow
+    print(agent.run({
+        "query": "Summarize the key points from this meeting recording",
+        "file": "meeting.wav"
+    }))
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """