wt002 committed on
Commit
f71d65e
·
verified ·
1 Parent(s): 1619cab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -117
app.py CHANGED
@@ -11,7 +11,8 @@ import requests
11
  from bs4 import BeautifulSoup
12
  import re
13
  from urllib.parse import quote
14
- import spacy
 
15
  from googlesearch import search
16
 
17
  load_dotenv()
@@ -26,13 +27,14 @@ class BasicAgent:
26
  print("BasicAgent initialized.")
27
  def __call__(self, question: str) -> str:
28
  print(f"Agent received question (first 50 chars): {question[:50]}...")
29
- fixed_answer = WebSearchAgent.run({question})
30
  print(f"Agent returning fixed answer: {fixed_answer}")
31
  return fixed_answer
32
 
33
- class WebSearchAgent:
 
 
34
  def __init__(self):
35
- self.nlp = spacy.load("en_core_web_sm")
36
  self.session = requests.Session()
37
  self.session.headers.update({
38
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
@@ -40,19 +42,23 @@ class WebSearchAgent:
40
  self.cache = {}
41
 
42
  def analyze_query(self, query):
43
- """Analyze the query to determine intent and extract entities"""
44
- doc = self.nlp(query)
45
-
46
  analysis = {
47
- 'entities': [(ent.text, ent.label_) for ent in doc.ents],
48
  'intent': self._determine_intent(query.lower()),
49
  'time_constraints': self._extract_time_constraints(query),
50
  'quantities': self._extract_quantities(query)
51
  }
52
  return analysis
53
 
 
 
 
 
 
 
54
  def _determine_intent(self, query):
55
- """Determine the intent of the query"""
56
  if 'how many' in query:
57
  return 'count'
58
  elif 'when' in query:
@@ -68,12 +74,10 @@ class WebSearchAgent:
68
  def _extract_time_constraints(self, text):
69
  """Extract time ranges from text"""
70
  constraints = []
71
- # Match patterns like "between 2000 and 2009"
72
  range_match = re.search(r'between (\d{4}) and (\d{4})', text)
73
  if range_match:
74
  constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
75
 
76
- # Match patterns like "in 2005"
77
  year_match = re.search(r'in (\d{4})', text)
78
  if year_match:
79
  constraints.append(('point', int(year_match.group(1))))
@@ -85,45 +89,39 @@ class WebSearchAgent:
85
  return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
86
 
87
  def search_web(self, query, num_results=3):
88
- """Search the web using multiple sources"""
89
- sources = {
90
- 'wikipedia': self._search_wikipedia,
91
- 'google': self._search_google
92
- }
93
-
94
  results = []
95
- for source_name, search_func in sources.items():
96
- try:
97
- results.extend(search_func(query, num_results))
98
- except Exception as e:
99
- print(f"Error searching {source_name}: {e}")
100
 
101
- return results[:num_results*2] # Return max of double the requested results
102
-
103
- def _search_wikipedia(self, query, num_results):
104
- """Search Wikipedia API"""
105
- url = "https://en.wikipedia.org/w/api.php"
106
- params = {
107
- 'action': 'query',
108
- 'list': 'search',
109
- 'srsearch': query,
110
- 'format': 'json',
111
- 'srlimit': num_results
112
- }
113
- response = self.session.get(url, params=params).json()
114
- return [{
115
- 'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
116
- 'title': item['title'],
117
- 'snippet': item['snippet'],
118
- 'source': 'wikipedia'
119
- } for item in response['query']['search']]
120
-
121
- def _search_google(self, query, num_results):
122
- """Search Google using python-googlesearch"""
123
- return [{
124
- 'url': url,
125
- 'source': 'google'
126
- } for url in search(query, num_results=num_results, stop=num_results)]
 
 
 
 
127
 
128
  def fetch_page(self, url):
129
  """Fetch and parse a web page with caching"""
@@ -151,11 +149,53 @@ class WebSearchAgent:
151
  print(f"Error fetching {url}: {e}")
152
  return None
153
 
154
- def extract_answer(self, page, analysis):
155
- """Extract relevant information from a page based on query analysis"""
156
- if not page:
157
- return None
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  if analysis['intent'] == 'count':
160
  return self._extract_count(page['text'], analysis)
161
  elif analysis['intent'] == 'date':
@@ -170,15 +210,13 @@ class WebSearchAgent:
170
  entities = [e[0] for e in analysis['entities']]
171
  pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
172
  matches = re.finditer(pattern, text, re.IGNORECASE)
173
-
174
- counts = [int(match.group(1)) for match in matches]
175
  return max(counts) if counts else None
176
 
177
  def _extract_date(self, text, analysis):
178
  """Extract dates from text"""
179
  date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
180
  dates = [match.group(0) for match in re.finditer(date_pattern, text)]
181
-
182
  entities = [e[0] for e in analysis['entities']]
183
  return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
184
 
@@ -186,65 +224,23 @@ class WebSearchAgent:
186
  """Extract list items from page"""
187
  entities = [e[0] for e in analysis['entities']]
188
  items = []
189
-
190
  for list_tag in soup.find_all(['ul', 'ol']):
191
  list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
192
  if any(e.lower() in ' '.join(list_items).lower() for e in entities):
193
  items.extend(list_items)
194
-
195
  return items if items else None
196
 
197
  def _extract_general(self, text, analysis):
198
  """Extract general information from text"""
199
  entities = [e[0] for e in analysis['entities']]
200
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
201
-
202
  relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
203
  return ' '.join(relevant) if relevant else None
204
 
205
- def answer_question(self, question, num_sources=3):
206
- """Main method to answer a question"""
207
- print(f"Processing question: {question}")
208
-
209
- # Step 1: Analyze the question
210
- analysis = self.analyze_query(question)
211
- print(f"Analysis: {analysis}")
212
-
213
- # Step 2: Search the web
214
- search_results = self.search_web(question, num_sources)
215
- print(f"Found {len(search_results)} potential sources")
216
-
217
- # Step 3: Fetch and analyze pages
218
- answers = []
219
- for result in search_results:
220
- page = self.fetch_page(result['url'])
221
- if page:
222
- answer = self.extract_answer(page, analysis)
223
- if answer:
224
- answers.append({
225
- 'answer': answer,
226
- 'source': result['url'],
227
- 'confidence': self._calculate_confidence(answer, analysis)
228
- })
229
-
230
- # Step 4: Return the best answer
231
- if not answers:
232
- return {"status": "No answers found"}
233
-
234
- answers.sort(key=lambda x: x['confidence'], reverse=True)
235
- return {
236
- "question": question,
237
- "best_answer": answers[0]['answer'],
238
- "source": answers[0]['source'],
239
- "confidence": answers[0]['confidence'],
240
- "all_answers": answers
241
- }
242
-
243
  def _calculate_confidence(self, answer, analysis):
244
  """Calculate confidence score for an answer"""
245
  confidence = 0.5 # Base confidence
246
 
247
- # Type matching
248
  if analysis['intent'] == 'count' and isinstance(answer, int):
249
  confidence += 0.3
250
  elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
@@ -252,7 +248,6 @@ class WebSearchAgent:
252
  elif analysis['intent'] == 'list' and isinstance(answer, list):
253
  confidence += 0.3
254
 
255
- # Time constraints
256
  if analysis['time_constraints'] and str(answer):
257
  for constraint in analysis['time_constraints']:
258
  if constraint[0] == 'range':
@@ -260,33 +255,25 @@ class WebSearchAgent:
260
  if any(constraint[1] <= int(y) <= constraint[2] for y in years):
261
  confidence += 0.2
262
 
263
- return min(0.99, max(0.1, confidence)) # Keep within bounds
264
 
265
- # Example usage
266
  if __name__ == "__main__":
267
- agent = WebSearchAgent()
268
 
269
  questions = [
270
- "How many studio albums were published by Taylor Swift between 2010 and 2015?",
271
- "When was Albert Einstein born?",
272
- "What is the capital of Australia?",
273
- "List the members of The Beatles"
274
  ]
275
 
276
  for question in questions:
277
- print("\n" + "="*50)
278
- print(f"Question: {question}")
279
  result = agent.answer_question(question)
280
-
281
- print("\nBest Answer:")
282
- if isinstance(result['best_answer'], list):
283
- for item in result['best_answer']:
284
- print(f"- {item}")
285
- else:
286
- print(result['best_answer'])
287
-
288
- print(f"\nSource: {result['source']}")
289
- print(f"Confidence: {result['confidence']:.0%}")
290
 
291
 
292
 
 
11
  from bs4 import BeautifulSoup
12
  import re
13
  from urllib.parse import quote
14
+ import requests
15
+ from urllib.parse import quote
16
  from googlesearch import search
17
 
18
  load_dotenv()
 
27
  print("BasicAgent initialized.")
28
  def __call__(self, question: str) -> str:
29
  print(f"Agent received question (first 50 chars): {question[:50]}...")
30
+ fixed_answer = agent.answer_question(question)
31
  print(f"Agent returning fixed answer: {fixed_answer}")
32
  return fixed_answer
33
 
34
+
35
+
36
+ class BasicAgent:
37
  def __init__(self):
 
38
  self.session = requests.Session()
39
  self.session.headers.update({
40
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
 
42
  self.cache = {}
43
 
44
  def analyze_query(self, query):
45
+ """Simplified query analysis without spaCy"""
 
 
46
  analysis = {
47
+ 'entities': self._extract_entities(query),
48
  'intent': self._determine_intent(query.lower()),
49
  'time_constraints': self._extract_time_constraints(query),
50
  'quantities': self._extract_quantities(query)
51
  }
52
  return analysis
53
 
54
+ def _extract_entities(self, text):
55
+ """Simple entity extraction using patterns"""
56
+ # Extract capitalized phrases (crude named entity recognition)
57
+ entities = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text)
58
+ return [(ent, 'UNKNOWN') for ent in entities if len(ent.split()) < 4]
59
+
60
  def _determine_intent(self, query):
61
+ """Determine intent using keyword matching"""
62
  if 'how many' in query:
63
  return 'count'
64
  elif 'when' in query:
 
74
  def _extract_time_constraints(self, text):
75
  """Extract time ranges from text"""
76
  constraints = []
 
77
  range_match = re.search(r'between (\d{4}) and (\d{4})', text)
78
  if range_match:
79
  constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
80
 
 
81
  year_match = re.search(r'in (\d{4})', text)
82
  if year_match:
83
  constraints.append(('point', int(year_match.group(1))))
 
89
  return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
90
 
91
  def search_web(self, query, num_results=3):
92
+ """Search the web using Google and Wikipedia"""
 
 
 
 
 
93
  results = []
 
 
 
 
 
94
 
95
+ # Google search
96
+ try:
97
+ results.extend({
98
+ 'url': url,
99
+ 'source': 'google'
100
+ } for url in search(query, num_results=num_results, stop=num_results))
101
+ except Exception as e:
102
+ print(f"Google search error: {e}")
103
+
104
+ # Wikipedia search
105
+ try:
106
+ wiki_url = "https://en.wikipedia.org/w/api.php"
107
+ params = {
108
+ 'action': 'query',
109
+ 'list': 'search',
110
+ 'srsearch': query,
111
+ 'format': 'json',
112
+ 'srlimit': num_results
113
+ }
114
+ response = self.session.get(wiki_url, params=params).json()
115
+ results.extend({
116
+ 'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
117
+ 'title': item['title'],
118
+ 'snippet': item['snippet'],
119
+ 'source': 'wikipedia'
120
+ } for item in response['query']['search'])
121
+ except Exception as e:
122
+ print(f"Wikipedia search error: {e}")
123
+
124
+ return results[:num_results*2]
125
 
126
  def fetch_page(self, url):
127
  """Fetch and parse a web page with caching"""
 
149
  print(f"Error fetching {url}: {e}")
150
  return None
151
 
152
+ def answer_question(self, question, num_sources=3):
153
+ """Main method to answer a question"""
154
+ print(f"\nQuestion: {question}")
 
155
 
156
+ # Step 1: Analyze the question
157
+ analysis = self.analyze_query(question)
158
+ print(f"Analysis: {analysis}")
159
+
160
+ # Step 2: Search the web
161
+ search_results = self.search_web(question, num_sources)
162
+ print(f"Found {len(search_results)} potential sources")
163
+
164
+ # Step 3: Fetch and analyze pages
165
+ answers = []
166
+ for result in search_results:
167
+ page = self.fetch_page(result['url'])
168
+ if page:
169
+ answer = self._extract_answer(page, analysis)
170
+ if answer:
171
+ answers.append({
172
+ 'answer': answer,
173
+ 'source': result['url'],
174
+ 'confidence': self._calculate_confidence(answer, analysis)
175
+ })
176
+
177
+ # Step 4: Return the best answer
178
+ if not answers:
179
+ return {"answer": "No answers found", "source": None}
180
+
181
+ answers.sort(key=lambda x: x['confidence'], reverse=True)
182
+ best_answer = answers[0]
183
+
184
+ # Format the output
185
+ result = {
186
+ "question": question,
187
+ "answer": best_answer['answer'],
188
+ "source": best_answer['source'],
189
+ "confidence": f"{best_answer['confidence']:.0%}"
190
+ }
191
+
192
+ if isinstance(best_answer['answer'], list):
193
+ result['answer'] = "\n- " + "\n- ".join(best_answer['answer'])
194
+
195
+ return result
196
+
197
+ def _extract_answer(self, page, analysis):
198
+ """Extract answer based on intent"""
199
  if analysis['intent'] == 'count':
200
  return self._extract_count(page['text'], analysis)
201
  elif analysis['intent'] == 'date':
 
210
  entities = [e[0] for e in analysis['entities']]
211
  pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
212
  matches = re.finditer(pattern, text, re.IGNORECASE)
213
+ counts = [int(match.group(1)) for match in matches]
 
214
  return max(counts) if counts else None
215
 
216
  def _extract_date(self, text, analysis):
217
  """Extract dates from text"""
218
  date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
219
  dates = [match.group(0) for match in re.finditer(date_pattern, text)]
 
220
  entities = [e[0] for e in analysis['entities']]
221
  return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
222
 
 
224
  """Extract list items from page"""
225
  entities = [e[0] for e in analysis['entities']]
226
  items = []
 
227
  for list_tag in soup.find_all(['ul', 'ol']):
228
  list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
229
  if any(e.lower() in ' '.join(list_items).lower() for e in entities):
230
  items.extend(list_items)
 
231
  return items if items else None
232
 
233
  def _extract_general(self, text, analysis):
234
  """Extract general information from text"""
235
  entities = [e[0] for e in analysis['entities']]
236
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
 
237
  relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
238
  return ' '.join(relevant) if relevant else None
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  def _calculate_confidence(self, answer, analysis):
241
  """Calculate confidence score for an answer"""
242
  confidence = 0.5 # Base confidence
243
 
 
244
  if analysis['intent'] == 'count' and isinstance(answer, int):
245
  confidence += 0.3
246
  elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
 
248
  elif analysis['intent'] == 'list' and isinstance(answer, list):
249
  confidence += 0.3
250
 
 
251
  if analysis['time_constraints'] and str(answer):
252
  for constraint in analysis['time_constraints']:
253
  if constraint[0] == 'range':
 
255
  if any(constraint[1] <= int(y) <= constraint[2] for y in years):
256
  confidence += 0.2
257
 
258
+ return min(0.99, max(0.1, confidence))
259
 
260
+ # Example usage
261
  if __name__ == "__main__":
262
+ agent = BasicAgent()
263
 
264
  questions = [
265
+ "How many studio albums did Taylor Swift release between 2010 and 2015?",
266
+ "When was the first iPhone released?",
267
+ "What is the capital of Canada?",
268
+ "List the planets in our solar system"
269
  ]
270
 
271
  for question in questions:
 
 
272
  result = agent.answer_question(question)
273
+ print(f"\nAnswer: {result['answer']}")
274
+ #print(f"Source: {result['source']}")
275
+ #print(f"Confidence: {result['confidence']}")
276
+ #print("="*50)
 
 
 
 
 
 
277
 
278
 
279