wt002 committed
Commit 256b6ef · verified · 1 Parent(s): 00a9519

Update app.py

Files changed (1):
  1. app.py +266 -72

app.py CHANGED
@@ -7,7 +7,12 @@ from typing import List, Dict, Union
 import requests
 import wikipediaapi
 import pandas as pd
-from duckduckgo_search import DDGS
 
 load_dotenv()
 
@@ -15,86 +20,275 @@ load_dotenv()
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-
-# Custom search tool class
-class CustomDuckDuckGoSearchTool:
-    def __call__(self, query: str, max_results: int = 5):
-        try:
-            with DDGS() as ddgs:
-                results = []
-                for r in ddgs.text(query):
-                    results.append(r)
-                    if len(results) >= max_results:
-                        break
-                return results
-        except Exception as e:
-            return f"Search error: {str(e)}"
-
-# Dummy placeholder for `visit_webpage` tool
-class VisitWebpageTool:
-    def __call__(self, url: str):
-        return f"Pretending to visit: {url}"
-
-# Final answer tool to format and return the final response
-class FinalAnswerTool:
-    def __call__(self, results):
-        formatted_answer = "Final Answer:\n"
-        for result in results:
-            formatted_answer += f"- {str(result)}\n"
-        return formatted_answer
-
-# Dummy model
-class DummyModel:
-    def call(self, input_text):
-        return f"Model processing: {input_text}"
-
-# Modified ToolCallingAgent to use FinalAnswerTool
-class ToolCallingAgent:
-    def __init__(self, tools, model, final_answer_tool, max_steps=10):
-        self.tools = tools
-        self.model = model
-        self.final_answer_tool = final_answer_tool
-        self.max_steps = max_steps
-
-    def run(self, query):
-        print(f"Running agent with query: {query}")
-        tool_outputs = []
-        for tool in self.tools:
-            output = tool(query)
-            print("Tool output:", output)
-            tool_outputs.append(output)
-        # Use the final answer tool to format the collected outputs
-        final_result = self.final_answer_tool(tool_outputs)
-        print(final_result)
-        return final_result
-
-# Initialize tools and model
-model = DummyModel()
-search_tool = CustomDuckDuckGoSearchTool()
-visit_webpage = VisitWebpageTool()
-final_answer = FinalAnswerTool()
-
-# Initialize the agent
-web_agent = ToolCallingAgent(
-    tools=[search_tool, visit_webpage],
-    model="google/gemma-7b",
-    final_answer_tool=final_answer,
-    max_steps=10
-)
-
-# Example usage
-#web_agent.run("Latest AI tools")
-
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = web_agent.run({question})
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
 
 import requests
 import wikipediaapi
 import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+import re
+from urllib.parse import quote
+import spacy
+from googlesearch import search
 
 load_dotenv()
 
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = WebSearchAgent().answer_question(question)
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
 
+class WebSearchAgent:
+    def __init__(self):
+        self.nlp = spacy.load("en_core_web_sm")
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+        self.cache = {}
+
+    def analyze_query(self, query):
+        """Analyze the query to determine intent and extract entities"""
+        doc = self.nlp(query)
+
+        analysis = {
+            'entities': [(ent.text, ent.label_) for ent in doc.ents],
+            'intent': self._determine_intent(query.lower()),
+            'time_constraints': self._extract_time_constraints(query),
+            'quantities': self._extract_quantities(query)
+        }
+        return analysis
+
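+    # Illustrative sketch of the analysis dict (exact entity labels depend on
+    # the spaCy model): for "When was Albert Einstein born?" this would
+    # typically yield:
+    #   {'entities': [('Albert Einstein', 'PERSON')], 'intent': 'date',
+    #    'time_constraints': [], 'quantities': []}
+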
+    def _determine_intent(self, query):
+        """Determine the intent of the query"""
+        if 'how many' in query:
+            return 'count'
+        elif 'when' in query:
+            return 'date'
+        elif 'who' in query:
+            return 'person'
+        elif 'what is' in query or 'define' in query:
+            return 'definition'
+        elif 'list' in query or 'name all' in query:
+            return 'list'
+        return 'general'
+
+    def _extract_time_constraints(self, text):
+        """Extract time ranges from text"""
+        constraints = []
+        # Match patterns like "between 2000 and 2009"
+        range_match = re.search(r'between (\d{4}) and (\d{4})', text)
+        if range_match:
+            constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
+
+        # Match patterns like "in 2005"
+        year_match = re.search(r'in (\d{4})', text)
+        if year_match:
+            constraints.append(('point', int(year_match.group(1))))
+
+        return constraints
+
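+    # Worked examples for the patterns above:
+    #   _extract_time_constraints("albums between 2010 and 2015")
+    #     -> [('range', 2010, 2015)]
+    #   _extract_time_constraints("who won in 2005") -> [('point', 2005)]
+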
+    def _extract_quantities(self, text):
+        """Extract numerical quantities from text"""
+        return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
+
+    def search_web(self, query, num_results=3):
+        """Search the web using multiple sources"""
+        sources = {
+            'wikipedia': self._search_wikipedia,
+            'google': self._search_google
+        }
+
+        results = []
+        for source_name, search_func in sources.items():
+            try:
+                results.extend(search_func(query, num_results))
+            except Exception as e:
+                print(f"Error searching {source_name}: {e}")
+
+        return results[:num_results*2]  # Return at most double the requested results
+
+    def _search_wikipedia(self, query, num_results):
+        """Search Wikipedia API"""
+        url = "https://en.wikipedia.org/w/api.php"
+        params = {
+            'action': 'query',
+            'list': 'search',
+            'srsearch': query,
+            'format': 'json',
+            'srlimit': num_results
+        }
+        response = self.session.get(url, params=params).json()
+        return [{
+            'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
+            'title': item['title'],
+            'snippet': item['snippet'],
+            'source': 'wikipedia'
+        } for item in response['query']['search']]
+
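+    # The MediaWiki search API returns snippets with embedded HTML highlight
+    # markup (<span class="searchmatch">...</span>), so the 'snippet' field
+    # may need tag-stripping before display.
+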
+    def _search_google(self, query, num_results):
+        """Search Google using the googlesearch package"""
+        # googlesearch-python's search() yields result URLs; num_results
+        # caps how many are fetched
+        return [{
+            'url': url,
+            'source': 'google'
+        } for url in search(query, num_results=num_results)]
+
+    def fetch_page(self, url):
+        """Fetch and parse a web page with caching"""
+        if url in self.cache:
+            return self.cache[url]
+
+        try:
+            response = self.session.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Clean the page content
+            for element in soup(['script', 'style', 'nav', 'footer']):
+                element.decompose()
+
+            page_data = {
+                'url': url,
+                'title': soup.title.string if soup.title else '',
+                'text': ' '.join(soup.stripped_strings),
+                'soup': soup
+            }
+
+            self.cache[url] = page_data
+            return page_data
+        except Exception as e:
+            print(f"Error fetching {url}: {e}")
+            return None
+
+    def extract_answer(self, page, analysis):
+        """Extract relevant information from a page based on query analysis"""
+        if not page:
+            return None
+
+        if analysis['intent'] == 'count':
+            return self._extract_count(page['text'], analysis)
+        elif analysis['intent'] == 'date':
+            return self._extract_date(page['text'], analysis)
+        elif analysis['intent'] == 'list':
+            return self._extract_list(page['soup'], analysis)
+        else:
+            return self._extract_general(page['text'], analysis)
+
+    def _extract_count(self, text, analysis):
+        """Extract a count/number from text"""
+        entities = [e[0] for e in analysis['entities']]
+        pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
+        matches = re.finditer(pattern, text, re.IGNORECASE)
+
+        counts = [int(match.group(1)) for match in matches]
+        return max(counts) if counts else None
+
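+    # e.g., with entity "studio albums", the sentence "She released 5 studio
+    # albums" matches the pattern above and yields the count 5.
+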
+    def _extract_date(self, text, analysis):
+        """Extract dates from text"""
+        date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
+        dates = [match.group(0) for match in re.finditer(date_pattern, text)]
+
+        # Return the first date found, provided at least one query entity
+        # appears somewhere in the text (a loose relevance heuristic)
+        entities = [e[0] for e in analysis['entities']]
+        return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
+
+    def _extract_list(self, soup, analysis):
+        """Extract list items from page"""
+        entities = [e[0] for e in analysis['entities']]
+        items = []
+
+        for list_tag in soup.find_all(['ul', 'ol']):
+            list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
+            if any(e.lower() in ' '.join(list_items).lower() for e in entities):
+                items.extend(list_items)
+
+        return items if items else None
+
+    def _extract_general(self, text, analysis):
+        """Extract general information from text"""
+        entities = [e[0] for e in analysis['entities']]
+        # Split on sentence-ending ". " or "? "; the lookbehinds avoid
+        # splitting after abbreviations like "e.g." or "Mr."
+        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
+
+        relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
+        return ' '.join(relevant) if relevant else None
+
+    def answer_question(self, question, num_sources=3):
+        """Main method to answer a question"""
+        print(f"Processing question: {question}")
+
+        # Step 1: Analyze the question
+        analysis = self.analyze_query(question)
+        print(f"Analysis: {analysis}")
+
+        # Step 2: Search the web
+        search_results = self.search_web(question, num_sources)
+        print(f"Found {len(search_results)} potential sources")
+
+        # Step 3: Fetch and analyze pages
+        answers = []
+        for result in search_results:
+            page = self.fetch_page(result['url'])
+            if page:
+                answer = self.extract_answer(page, analysis)
+                if answer:
+                    answers.append({
+                        'answer': answer,
+                        'source': result['url'],
+                        'confidence': self._calculate_confidence(answer, analysis)
+                    })
+
+        # Step 4: Return the best answer
+        if not answers:
+            return {"status": "No answers found"}
+
+        answers.sort(key=lambda x: x['confidence'], reverse=True)
+        return {
+            "question": question,
+            "best_answer": answers[0]['answer'],
+            "source": answers[0]['source'],
+            "confidence": answers[0]['confidence'],
+            "all_answers": answers
+        }
+
+    def _calculate_confidence(self, answer, analysis):
+        """Calculate confidence score for an answer"""
+        confidence = 0.5  # Base confidence
+
+        # Type matching
+        if analysis['intent'] == 'count' and isinstance(answer, int):
+            confidence += 0.3
+        elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
+            confidence += 0.3
+        elif analysis['intent'] == 'list' and isinstance(answer, list):
+            confidence += 0.3
+
+        # Time constraints
+        if analysis['time_constraints'] and str(answer):
+            for constraint in analysis['time_constraints']:
+                if constraint[0] == 'range':
+                    # Non-capturing group so findall returns full years,
+                    # not just the century prefix
+                    years = re.findall(r'\b(?:19|20)\d{2}\b', str(answer))
+                    if any(constraint[1] <= int(y) <= constraint[2] for y in years):
+                        confidence += 0.2
+
+        return min(0.99, max(0.1, confidence))  # Keep within bounds
+
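+    # Worked example: a 'date' question answered with "14 March 1879" scores
+    # 0.5 (base) + 0.3 (type match) = 0.8.
+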
+# Example usage
+if __name__ == "__main__":
+    agent = WebSearchAgent()
+
+    questions = [
+        "How many studio albums were published by Taylor Swift between 2010 and 2015?",
+        "When was Albert Einstein born?",
+        "What is the capital of Australia?",
+        "List the members of The Beatles"
+    ]
+
+    for question in questions:
+        print("\n" + "="*50)
+        print(f"Question: {question}")
+        result = agent.answer_question(question)
+
+        # answer_question returns {"status": ...} when nothing was found
+        if 'best_answer' not in result:
+            print("\nNo answer found.")
+            continue
+
+        print("\nBest Answer:")
+        if isinstance(result['best_answer'], list):
+            for item in result['best_answer']:
+                print(f"- {item}")
+        else:
+            print(result['best_answer'])
+
+        print(f"\nSource: {result['source']}")
+        print(f"Confidence: {result['confidence']:.0%}")
+
+
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """