wt002 committed on
Commit b2a7d74 · verified · 1 Parent(s): e06cf2f

Update app.py

Files changed (1):
  1. app.py +117 -92
app.py CHANGED
@@ -3,7 +3,10 @@ from dotenv import load_dotenv
 import gradio as gr
 import requests
 
-from typing import List, Dict, Union
+from typing import List, Dict, Union, Optional
+from bs4 import BeautifulSoup   # still used by ContentExtractorAgent below
+from datetime import datetime   # used for page/search timestamps below
+import ast                      # safe parsing of the agents' repr() strings
 import pandas as pd
 import wikipediaapi
 import requests
@@ -22,104 +25,129 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 # --- Basic Agent Definition ---
 
-import requests
-from bs4 import BeautifulSoup
-import urllib.parse
-import re
-from typing import Optional
-
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-            'Accept-Language': 'en-US,en;q=0.9'
-        }
-        self.answer_patterns = {
-            'definition': r'(?:is|are|was|were) (?:an?|the)? (.+?)(?:\.|,)',
-            'quantity': r'(?:is|are|was|were) (?:about|approximately)? (\d+[\d,\.]*\s*\w+)',
-            'person': r'(?:by|named) (.+?)(?:\.|,)',
-            'date': r'(?:on|in) (.+? \d{4}|\d{1,2} [A-Za-z]+ \d{4})'
-        }
-
+
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-
-        # Try Wikipedia first for factual questions
-        if self._is_wikipedia_question(question):
-            answer = self._search_wikipedia(question)
-            if answer and answer != "No answer found":
-                return answer
-
-        # Fall back to Google search
-        answer = self._search_google(question)
-        print(f"Agent returning answer: {answer[:50]}...")
-        return answer
-
-    def _is_wikipedia_question(self, question: str) -> bool:
-        """Check if question is suitable for Wikipedia"""
-        question_lower = question.lower()
-        return any(keyword in question_lower
-                   for keyword in ['who', 'what', 'when', 'where', 'why', 'how', 'define'])
-
-    def _search_wikipedia(self, question: str) -> str:
-        """Search Wikipedia directly for answers"""
-        try:
-            # Extract main topic from question
-            topic = re.sub(r'(who|what|when|where|why|how|is|are|was|were|did|does|do)\s+', '', question, flags=re.IGNORECASE)
-            topic = re.sub(r'\?.*', '', topic).strip()
-
-            url = f"https://en.wikipedia.org/wiki/{urllib.parse.quote(topic.replace(' ', '_'))}"
-            response = requests.get(url, headers=self.headers, timeout=5)
-
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'html.parser')
-                first_paragraph = soup.select_one('div.mw-parser-output > p:not(.mw-empty-elt)')
-
-                if first_paragraph:
-                    text = first_paragraph.get_text()
-                    # Try to extract most relevant sentence
-                    for pattern_type, pattern in self.answer_patterns.items():
-                        match = re.search(pattern, text, re.IGNORECASE)
-                        if match:
-                            return f"{match.group(1).strip()} (Source: Wikipedia)"
-
-                    return text.split('.')[0] + " (Source: Wikipedia)"
-
-            return "No answer found"
-
-        except Exception:
-            return "No answer found"
-
-    def _search_google(self, question: str) -> str:
-        """Search Google for answers"""
-        try:
-            url = f"https://www.google.com/search?q={urllib.parse.quote(question)}"
-            response = requests.get(url, headers=self.headers, timeout=5)
-            soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Check Google's answer boxes
-            for selector in ['.Z0LcW', '.LGOjhe', '.hgKElc', '.kno-rdesc span']:
-                element = soup.select_one(selector)
-                if element:
-                    return element.get_text() + " (Source: Google)"
-
-            # Try featured snippet
-            snippet = soup.select_one('.xpdopen .kno-rdesc span, .ifM9O')
-            if snippet:
-                return snippet.get_text() + " (Source: Google)"
-
-            # Fallback to first result summary
-            first_result = soup.select_one('.tF2Cxc')
-            if first_result:
-                summary = first_result.select_one('.IsZvec, .VwiC3b')
-                if summary:
-                    return summary.get_text()[:150] + "... (Source: Google)"
-
-            return "No concise answer found"
-
-        except Exception:
-            return "Search failed"
+        fixed_answer = self.process_request(question)
+        print(f"Agent returning answer: {fixed_answer}")
+        return fixed_answer
+
+    def process_request(self, question: str) -> str:
+        return "This is a default answer."
+
+class SearchAgent(BasicAgent):
+    def __init__(self):
+        super().__init__()
+        print("SearchAgent specialized initialization.")
+
+    def process_request(self, query: str) -> str:
+        # In a real implementation, this would call a search API
+        mock_results = [
+            {"url": f"https://example.com/result{i}", "title": f"Result {i} for {query[:20]}..."}
+            for i in range(1, 4)
+        ]
+        return str(mock_results)
+
+class BrowserAgent(BasicAgent):
+    def __init__(self):
+        super().__init__()
+        self.current_page = None
+        self.history = []
+        self.session = requests.Session()
+        self.session.headers.update({'User-Agent': 'WebNavigator/1.0'})
+        print("BrowserAgent initialized with fresh session.")
+
+    def process_request(self, url: str) -> str:
+        try:
+            response = self.session.get(url)
+            response.raise_for_status()
+            self.current_page = {
+                'url': url,
+                'content': response.text,
+                'timestamp': datetime.now()
+            }
+            self.history.append(self.current_page)
+            return f"Successfully retrieved page: {url}"
+        except Exception as e:
+            return f"Error visiting {url}: {str(e)}"
+
+class ContentExtractorAgent(BasicAgent):
+    def __init__(self):
+        super().__init__()
+        print("ContentExtractorAgent initialized.")
+
+    def process_request(self, html: str) -> str:
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'footer']):
+            element.decompose()
+
+        title = soup.title.string if soup.title else ""
+        main_content = soup.find('main') or soup.find('article') or soup.body
+
+        extracted = {
+            'title': title,
+            'text': main_content.get_text(separator='\n', strip=True),
+            'links': [a['href'] for a in main_content.find_all('a', href=True)]
+        }
+        return str(extracted)
+
+class WebNavigator(BasicAgent):
+    def __init__(self):
+        super().__init__()
+        self.search_agent = SearchAgent()
+        self.browser_agent = BrowserAgent()
+        self.extractor_agent = ContentExtractorAgent()
+        self.search_history = []
+        print("WebNavigator fully initialized with all sub-agents.")
+
+    def process_request(self, question: str) -> str:
+        # First try to interpret as a direct URL
+        if question.startswith(('http://', 'https://')):
+            return self.get_page_summary(question)
+
+        # Otherwise treat as search query
+        return self.search_and_extract(question)
+
+    def search_and_extract(self, query: str) -> str:
+        # Sub-agents return repr() strings; parse them back with ast.literal_eval
+        # rather than eval(), which would execute arbitrary expressions.
+        search_results = ast.literal_eval(self.search_agent(query))
+        extracted_data = []
+
+        for result in search_results:
+            visit_result = self.browser_agent(result['url'])
+            if "Successfully" in visit_result:
+                html = self.browser_agent.current_page['content']  # stored HTML string
+                content = self.extractor_agent(html)
+                extracted_data.append({
+                    'query': query,
+                    'url': result['url'],
+                    'content': ast.literal_eval(content)  # repr string back to dict
+                })
+
+        self.search_history.append({
+            'query': query,
+            'timestamp': datetime.now(),
+            'results': extracted_data
+        })
+
+        return str(extracted_data)
+
+    def get_page_summary(self, url: str) -> str:
+        visit_result = self.browser_agent(url)
+        if "Successfully" in visit_result:
+            html = self.browser_agent.current_page['content']
+            content = ast.literal_eval(self.extractor_agent(html))
+            return str({
+                'url': url,
+                'title': content['title'],
+                'summary': ' '.join(content['text'].split()[:100]) + '...'
+            })
+        return visit_result  # Return the error message
+
 
 
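The diff leaves SearchAgent.process_request as a mock behind the comment "this would call a search API". As an illustration only, a subclass like the hypothetical LiveSearchAgent below could swap in a real endpoint; this sketch assumes DuckDuckGo's Instant Answer API, whose RelatedTopics/FirstURL/Text JSON fields come from that API's docs, not from this commit, and it keeps the mock's contract of returning the repr of a list of {url, title} dicts:

import requests

class LiveSearchAgent(SearchAgent):
    def process_request(self, query: str) -> str:
        # Hypothetical: query DuckDuckGo's Instant Answer API (no key needed);
        # verify the endpoint and field names against its documentation.
        resp = requests.get(
            "https://api.duckduckgo.com/",
            params={"q": query, "format": "json", "no_html": 1},
            timeout=5,
        )
        resp.raise_for_status()
        data = resp.json()
        # Same contract as the mock: repr of a list of {url, title} dicts,
        # capped at three results.
        results = [
            {"url": t["FirstURL"], "title": t["Text"]}
            for t in data.get("RelatedTopics", [])
            if "FirstURL" in t  # skip nested topic groups
        ][:3]
        return str(results)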
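Finally, a minimal sketch of how the new hierarchy is meant to be driven, using only names defined in the diff; the question string is a placeholder input, and https://example.com mirrors the mock URLs above:

navigator = WebNavigator()

# A plain question is routed SearchAgent -> BrowserAgent -> ContentExtractorAgent
# and comes back as the repr of a list of extracted-page dicts.
print(navigator("capital of France"))

# A direct URL skips the search step and returns a ~100-word page summary.
print(navigator("https://example.com"))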