wt002 committed on
Commit 4f47377 · verified · 1 Parent(s): 8289a44

Update app.py

Files changed (1)
  1. app.py +43 -60
app.py CHANGED
@@ -11,9 +11,6 @@ import requests
 from bs4 import BeautifulSoup
 import re
 from urllib.parse import quote
-import requests
-from urllib.parse import quote
-from googlesearch import search
 
 load_dotenv()
 
@@ -25,6 +22,11 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+        self.cache = {}
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         fixed_answer = self.answer_question(question)
@@ -32,36 +34,27 @@ class BasicAgent:
         return fixed_answer
 
 
-
-class BasicAgent:
-    def __init__(self):
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        })
-        self.cache = {}
-
     def analyze_query(self, query):
-        """Simplified query analysis without spaCy"""
-        analysis = {
+        """Analyze the query using regex patterns"""
+        return {
             'entities': self._extract_entities(query),
             'intent': self._determine_intent(query.lower()),
             'time_constraints': self._extract_time_constraints(query),
             'quantities': self._extract_quantities(query)
         }
-        return analysis
 
     def _extract_entities(self, text):
-        """Simple entity extraction using patterns"""
-        # Extract capitalized phrases (crude named entity recognition)
-        entities = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text)
-        return [(ent, 'UNKNOWN') for ent in entities if len(ent.split()) < 4]
+        """Simple entity extraction using capitalization patterns"""
+        # Find proper nouns (capitalized phrases)
+        entities = re.findall(r'([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)', text)
+        # Filter out small words and standalone letters
+        return [(ent, 'UNKNOWN') for ent in entities if len(ent) > 2 and ' ' in ent]
 
     def _determine_intent(self, query):
-        """Determine intent using keyword matching"""
+        """Determine intent using keyword patterns"""
         if 'how many' in query:
             return 'count'
-        elif 'when' in query:
+        elif 'when' in query or 'date' in query:
             return 'date'
         elif 'who' in query:
             return 'person'
@@ -72,12 +65,14 @@ class BasicAgent:
         return 'general'
 
     def _extract_time_constraints(self, text):
-        """Extract time ranges from text"""
+        """Extract year ranges from text"""
         constraints = []
+        # Match patterns like "between 2000 and 2009"
         range_match = re.search(r'between (\d{4}) and (\d{4})', text)
         if range_match:
             constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
 
+        # Match patterns like "in 2005"
         year_match = re.search(r'in (\d{4})', text)
         if year_match:
             constraints.append(('point', int(year_match.group(1))))
@@ -85,46 +80,33 @@ class BasicAgent:
         return constraints
 
     def _extract_quantities(self, text):
-        """Extract numerical quantities from text"""
+        """Extract numbers from text"""
         return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
 
-    def search_web(self, query, num_results=3):
-        """Search the web using Google and Wikipedia"""
-        results = []
-
-        # Google search
-        try:
-            results.extend({
-                'url': url,
-                'source': 'google'
-            } for url in search(query, num_results=num_results, stop=num_results))
-        except Exception as e:
-            print(f"Google search error: {e}")
-
-        # Wikipedia search
+    def search_wikipedia(self, query, num_results=3):
+        """Search Wikipedia's API"""
+        url = "https://en.wikipedia.org/w/api.php"
+        params = {
+            'action': 'query',
+            'list': 'search',
+            'srsearch': query,
+            'format': 'json',
+            'srlimit': num_results
+        }
         try:
-            wiki_url = "https://en.wikipedia.org/w/api.php"
-            params = {
-                'action': 'query',
-                'list': 'search',
-                'srsearch': query,
-                'format': 'json',
-                'srlimit': num_results
-            }
-            response = self.session.get(wiki_url, params=params).json()
-            results.extend({
+            response = self.session.get(url, params=params).json()
+            return [{
                 'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
                 'title': item['title'],
                 'snippet': item['snippet'],
                 'source': 'wikipedia'
-            } for item in response['query']['search'])
+            } for item in response['query']['search']]
         except Exception as e:
             print(f"Wikipedia search error: {e}")
-
-        return results[:num_results*2]
+            return []
 
     def fetch_page(self, url):
-        """Fetch and parse a web page with caching"""
+        """Fetch and parse a Wikipedia page"""
        if url in self.cache:
            return self.cache[url]
 
@@ -133,7 +115,7 @@ class BasicAgent:
             soup = BeautifulSoup(response.text, 'html.parser')
 
             # Clean the page content
-            for element in soup(['script', 'style', 'nav', 'footer']):
+            for element in soup(['script', 'style', 'nav', 'footer', 'table']):
                 element.decompose()
 
             page_data = {
@@ -149,17 +131,18 @@ class BasicAgent:
             print(f"Error fetching {url}: {e}")
             return None
 
-    def answer_question(self, question, num_sources=3):
-        """Main method to answer a question"""
+    def answer_question(self, question):
+        """Answer a question using Wikipedia"""
         print(f"\nQuestion: {question}")
 
         # Step 1: Analyze the question
         analysis = self.analyze_query(question)
         print(f"Analysis: {analysis}")
 
-        # Step 2: Search the web
-        search_results = self.search_web(question, num_sources)
-        print(f"Found {len(search_results)} potential sources")
+        # Step 2: Search Wikipedia
+        search_results = self.search_wikipedia(question)
+        if not search_results:
+            return {"answer": "No Wikipedia results found", "source": None}
 
         # Step 3: Fetch and analyze pages
         answers = []
@@ -176,7 +159,7 @@ class BasicAgent:
 
         # Step 4: Return the best answer
         if not answers:
-            return {"answer": "No answers found", "source": None}
+            return {"answer": "No answers found in Wikipedia", "source": None}
 
         answers.sort(key=lambda x: x['confidence'], reverse=True)
         best_answer = answers[0]
@@ -210,7 +193,7 @@ class BasicAgent:
         entities = [e[0] for e in analysis['entities']]
         pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
         matches = re.finditer(pattern, text, re.IGNORECASE)
         counts = [int(match.group(1)) for match in matches]
         return max(counts) if counts else None
 
     def _extract_date(self, text, analysis):
@@ -259,7 +242,7 @@ class BasicAgent:
 
 # Example usage
 if __name__ == "__main__":
-    agent = SimpleWebSearchAgent()
+    agent = BasicAgent()
 
     questions = [
         "How many studio albums did Taylor Swift release between 2010 and 2015?",