wt002 commited on
Commit
a37281a
Β·
verified Β·
1 Parent(s): 29bc439

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -187
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import gradio as gr
3
  import requests
 
4
  import speech_recognition as sr
5
  from smolagents import OpenAIServerModel, DuckDuckGoSearchTool, CodeAgent, WikipediaSearchTool
6
  from pathlib import Path
@@ -16,189 +17,28 @@ from langchain.agents import initialize_agent
16
  from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
17
  from langchain_community.llms import HuggingFaceHub
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # (Keep Constants as is)
20
  # --- Constants ---
21
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
22
 
23
 
24
- class SpeechToTextTool(PipelineTool):
25
- """
26
- Transcribes an audio file to text using the OpenAI Whisper API.
27
- Only local file paths are supported.
28
- """
29
- default_checkpoint = "openai/whisper-1" # purely informational here
30
- description = (
31
- "This tool sends an audio file to OpenAI Whisper and returns the "
32
- "transcribed text."
33
- )
34
- name = "transcriber"
35
- inputs = {
36
- "audio": {
37
- "type": "string",
38
- "description": "Absolute or relative path to a local audio file.",
39
- }
40
- }
41
- output_type = "string"
42
-
43
- # ──────────────────────────────────────────────────────────────────────────
44
- # Public interface
45
- # ──────────────────────────────────────────────────────────────────────────
46
- def __call__(self, audio: str) -> str:
47
- """
48
- Convenience wrapper so the tool can be used like a regular function:
49
- text = SpeechToTextTool()(path_to_audio)
50
- """
51
- return self._transcribe(audio)
52
-
53
- # ──────────────────────────────────────────────────────────────────────────
54
- # Internal helpers
55
- # ──────────────────────────────────────────────────────────────────────────
56
- @staticmethod
57
- def _transcribe(audio_path: str) -> str:
58
- # ----- validation ----------------------------------------------------
59
- if not isinstance(audio_path, str):
60
- raise TypeError(
61
- "Parameter 'audio' must be a string containing the file path."
62
- )
63
- path = Path(audio_path).expanduser().resolve()
64
- if not path.is_file():
65
- raise FileNotFoundError(f"No such audio file: {path}")
66
-
67
- # ----- API call ------------------------------------------------------
68
- with path.open("rb") as fp:
69
- response = openai.audio.transcriptions.create(
70
- file=fp,
71
- model="whisper-1", # currently the only Whisper model
72
- response_format="text" # returns plain text instead of JSON
73
- )
74
-
75
- # For response_format="text", `response` is already the raw transcript
76
- return response
77
-
78
- def transcribe_audio(audio_file_path):
79
- recognizer = sr.Recognizer()
80
- with sr.AudioFile(audio_file_path) as source:
81
- audio_data = recognizer.record(source)
82
- try:
83
- text = recognizer.recognize_google(audio_data)
84
- return text
85
- except sr.UnknownValueError:
86
- return "Could not understand audio"
87
- except sr.RequestError:
88
- return "Could not request results (check internet connection)"
89
-
90
-
91
-
92
-
93
- class ExcelToTextTool(Tool):
94
- """Render an Excel worksheet as Markdown text."""
95
-
96
- # ------------------------------------------------------------------
97
- # Required smol‑agents metadata
98
- # ------------------------------------------------------------------
99
- name = "excel_to_text"
100
- description = (
101
- "Read an Excel file and return a Markdown table of the requested sheet. "
102
- "Accepts either the sheet name or the zero-based index."
103
- )
104
-
105
- inputs = {
106
- "excel_path": {
107
- "type": "string",
108
- "description": "Path to the Excel file (.xlsx / .xls).",
109
- },
110
- "sheet_name": {
111
- "type": "string",
112
- "description": (
113
- "Worksheet name or zero‑based index *as a string* (optional; default first sheet)."
114
- ),
115
- "nullable": True,
116
- },
117
- }
118
-
119
- output_type = "string"
120
-
121
- # ------------------------------------------------------------------
122
- # Core logic
123
- # ------------------------------------------------------------------
124
- def forward(
125
- self,
126
- excel_path: str,
127
- sheet_name: Optional[str] = None,
128
- ) -> str:
129
- """Load *excel_path* and return the sheet as a Markdown table."""
130
-
131
- path = pathlib.Path(excel_path).expanduser().resolve()
132
- if not path.exists():
133
- return f"Error: Excel file not found at {path}"
134
-
135
- try:
136
- # Interpret sheet identifier -----------------------------------
137
- sheet: Union[str, int]
138
- if sheet_name is None or sheet_name == "":
139
- sheet = 0 # first sheet
140
- else:
141
- # If the user passed a numeric string (e.g. "1"), cast to int
142
- sheet = int(sheet_name) if sheet_name.isdigit() else sheet_name
143
-
144
- # Load worksheet ----------------------------------------------
145
- df = pd.read_excel(path, sheet_name=sheet)
146
-
147
- # Render to Markdown; fall back to tabulate if needed ---------
148
- if hasattr(pd.DataFrame, "to_markdown"):
149
- return df.to_markdown(index=False)
150
- from tabulate import tabulate # pragma: no cover – fallback path
151
-
152
- return tabulate(df, headers="keys", tablefmt="github", showindex=False)
153
-
154
- except Exception as exc: # broad catch keeps the agent chat‑friendly
155
- return f"Error reading Excel file: {exc}"
156
-
157
-
158
- def download_file_if_any(base_api_url: str, task_id: str) -> str | None:
159
- """
160
- Try GET /files/{task_id}.
161
- β€’ On HTTP 200 β†’ save to a temp dir and return local path.
162
- β€’ On 404 β†’ return None.
163
- β€’ On other errors β†’ raise so caller can log / handle.
164
- """
165
- url = f"{base_api_url}/files/{task_id}"
166
- try:
167
- resp = requests.get(url, timeout=30)
168
- if resp.status_code == 404:
169
- return None # no file
170
- resp.raise_for_status() # raise on 4xx/5xx β‰  404
171
- except requests.exceptions.HTTPError as e:
172
- # propagate non-404 errors (403, 500, …)
173
- raise e
174
-
175
- # β–Έ Save bytes to a named file inside the system temp dir
176
- # Try to keep original extension from Content-Disposition if present.
177
- cdisp = resp.headers.get("content-disposition", "")
178
- filename = task_id # default base name
179
- if "filename=" in cdisp:
180
- m = re.search(r'filename="([^"]+)"', cdisp)
181
- if m:
182
- filename = m.group(1) # keep provided name
183
-
184
- tmp_dir = Path(tempfile.gettempdir()) / "gaia_files"
185
- tmp_dir.mkdir(exist_ok=True)
186
- file_path = tmp_dir / filename
187
- with open(file_path, "wb") as f:
188
- f.write(resp.content)
189
- return str(file_path)
190
-
191
  # --- Basic Agent Definition ---
192
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
193
-
194
  class BasicAgent:
195
  def __init__(self):
196
- # Initialize LLM (requires HuggingFace API token)
197
- llm = HuggingFaceHub(
198
- repo_id="meta-llama/Meta-Llama-3-8B-Instruct" #,
199
- # huggingfacehub_api_token="your_token"
200
- )
201
-
202
  print("BasicAgent initialized.")
203
 
204
  def __call__(self, question: str) -> str:
@@ -207,23 +47,162 @@ class BasicAgent:
207
  print(f"Agent returning answer: {fixed_answer}")
208
  return fixed_answer
209
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  # Initialize tools
212
- tools = [
213
- DuckDuckGoSearchRun(),
214
- WikipediaQueryRun()
215
- # Would need custom implementations for other tools
216
- ]
217
 
218
- self.agent = initialize_agent(
219
- tools=tools,
220
- llm=llm,
221
- agent="zero-shot-react-description",
222
- verbose=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  )
224
 
225
- def run(self, prompt):
226
- return self.agent.run(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  def run_and_submit_all( profile: gr.OAuthProfile | None):
229
  """
 
1
  import os
2
  import gradio as gr
3
  import requests
4
+
5
  import speech_recognition as sr
6
  from smolagents import OpenAIServerModel, DuckDuckGoSearchTool, CodeAgent, WikipediaSearchTool
7
  from pathlib import Path
 
17
  from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
18
  from langchain_community.llms import HuggingFaceHub
19
 
20
+ from typing import Union
21
+ import os
22
+ from langchain.agents import AgentExecutor, Tool, initialize_agent
23
+ from langchain_community.llms import Ollama
24
+ from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
25
+ from langchain_community.document_loaders import (
26
+ CSVLoader,
27
+ PyPDFLoader,
28
+ UnstructuredWordDocumentLoader
29
+ )
30
+ from langchain_community.utilities import TextRequestsWrapper
31
+ import speech_recognition as sr
32
+ from pydub import AudioSegment # For audio format conversion
33
+
34
  # (Keep Constants as is)
35
  # --- Constants ---
36
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # --- Basic Agent Definition ---
 
 
40
  class BasicAgent:
41
  def __init__(self):
 
 
 
 
 
 
42
  print("BasicAgent initialized.")
43
 
44
  def __call__(self, question: str) -> str:
 
47
  print(f"Agent returning answer: {fixed_answer}")
48
  return fixed_answer
49
 
50
+
51
+ def __init__(self, model_name: str = "llama3"):
52
+ """
53
+ Open-source multi-modal agent with:
54
+ - Web search
55
+ - Document processing
56
+ - Speech-to-text
57
+ - URL content fetching
58
+ """
59
+ # Initialize LLM (local via Ollama)
60
+ self.llm = Ollama(model=model_name, temperature=0.7)
61
 
62
  # Initialize tools
63
+ self.search_tool = DuckDuckGoSearchRun()
64
+ self.wikipedia_tool = WikipediaQueryRun()
65
+ self.requests_tool = TextRequestsWrapper()
 
 
66
 
67
+ # Speech recognition
68
+ self.recognizer = sr.Recognizer()
69
+
70
+ # Initialize agent
71
+ self.tools = self._initialize_tools()
72
+ self.agent = self._create_agent()
73
+
74
+ def _initialize_tools(self) -> list[Tool]:
75
+ """Initialize all available tools"""
76
+ return [
77
+ Tool(
78
+ name="Web Search",
79
+ func=self.search_tool.run,
80
+ description="For current events/unknown topics"
81
+ ),
82
+ Tool(
83
+ name="Wikipedia",
84
+ func=self.wikipedia_tool.run,
85
+ description="For factual information"
86
+ ),
87
+ Tool(
88
+ name="Document Loader",
89
+ func=self.process_document,
90
+ description="Processes PDF, Word, CSV files"
91
+ ),
92
+ Tool(
93
+ name="Speech Transcription",
94
+ func=self.transcribe_audio,
95
+ description="Converts speech from audio files to text"
96
+ ),
97
+ Tool(
98
+ name="Website Content",
99
+ func=self.requests_tool.get,
100
+ description="Fetches content from URLs"
101
+ )
102
+ ]
103
+
104
+ def _create_agent(self) -> AgentExecutor:
105
+ """Create the agent executor"""
106
+ return initialize_agent(
107
+ tools=self.tools,
108
+ llm=self.llm,
109
+ agent="structured-chat-react",
110
+ verbose=True,
111
+ handle_parsing_errors=True
112
  )
113
 
114
+ def process_document(self, file_path: str) -> str:
115
+ """Handle different document types"""
116
+ if not os.path.exists(file_path):
117
+ return "File not found"
118
+
119
+ ext = os.path.splitext(file_path)[1].lower()
120
+
121
+ try:
122
+ if ext == '.pdf':
123
+ loader = PyPDFLoader(file_path)
124
+ elif ext in ('.doc', '.docx'):
125
+ loader = UnstructuredWordDocumentLoader(file_path)
126
+ elif ext == '.csv':
127
+ loader = CSVLoader(file_path)
128
+ else:
129
+ return "Unsupported file format"
130
+
131
+ docs = loader.load()
132
+ return "\n".join([doc.page_content for doc in docs])
133
+
134
+ except Exception as e:
135
+ return f"Error processing document: {str(e)}"
136
+
137
+ def _convert_audio_format(self, audio_path: str) -> str:
138
+ """Convert audio to WAV format if needed"""
139
+ if audio_path.endswith('.wav'):
140
+ return audio_path
141
+
142
+ try:
143
+ sound = AudioSegment.from_file(audio_path)
144
+ wav_path = os.path.splitext(audio_path)[0] + ".wav"
145
+ sound.export(wav_path, format="wav")
146
+ return wav_path
147
+ except:
148
+ return audio_path # Fallback to original if conversion fails
149
+
150
+ def transcribe_audio(self, audio_path: str) -> str:
151
+ """Convert speech to text using purely open-source tools"""
152
+ audio_path = self._convert_audio_format(audio_path)
153
+
154
+ try:
155
+ with sr.AudioFile(audio_path) as source:
156
+ audio = self.recognizer.record(source)
157
+ return self.recognizer.recognize_vosk(audio) # Offline recognition
158
+ except sr.UnknownValueError:
159
+ try:
160
+ # Fallback to Sphinx if Vosk fails
161
+ return self.recognizer.recognize_sphinx(audio)
162
+ except Exception as e:
163
+ return f"Transcription failed: {str(e)}"
164
+
165
+ def run(self, input_data: Union[str, dict]) -> str:
166
+ """
167
+ Handle different input types:
168
+ - Text queries
169
+ - File paths
170
+ - Structured requests
171
+ """
172
+ if isinstance(input_data, dict):
173
+ if 'query' in input_data:
174
+ return self.agent.run(input_data['query'])
175
+ elif 'file' in input_data:
176
+ content = self.process_document(input_data['file'])
177
+ return self.agent.run(f"Process this: {content}")
178
+ elif isinstance(input_data, str):
179
+ if input_data.endswith(('.pdf', '.docx', '.csv')):
180
+ content = self.process_document(input_data)
181
+ return self.agent.run(f"Process this document: {content}")
182
+ elif input_data.endswith(('.wav', '.mp3', '.ogg')):
183
+ content = self.transcribe_audio(input_data)
184
+ return self.agent.run(f"Process this transcript: {content}")
185
+ else:
186
+ return self.agent.run(input_data)
187
+ return "Unsupported input type"
188
+
189
+ # Usage Example
190
+ if __name__ == "__main__":
191
+ agent = FullyOpenSourceAgent(model_name="mistral") # Try "llama3", "gemma", etc.
192
+
193
+ # Example 1: Web search
194
+ print(agent.run("Latest breakthroughs in renewable energy"))
195
+
196
+ # Example 2: Process document
197
+ print(agent.run({"file": "research.pdf"}))
198
+
199
+ # Example 3: Complex workflow
200
+ print(agent.run({
201
+ "query": "Summarize the key points from this meeting recording",
202
+ "file": "meeting.wav"
203
+ }))
204
+
205
+
206
 
207
  def run_and_submit_all( profile: gr.OAuthProfile | None):
208
  """