wt002 committed on
Commit cda9f5c · verified · 1 Parent(s): 465be66

Update app.py

Files changed (1)
  1. app.py +102 -219
app.py CHANGED
@@ -6,11 +6,11 @@ import requests
  from typing import List, Dict, Union
  import requests
  import wikipediaapi
- import pandas as pd
+ import google.generativeai as genai
+ from typing import List, Dict, Union
  import requests
- from bs4 import BeautifulSoup
- import re
- from urllib.parse import quote
+ import wikipediaapi
+ import pandas as pd

  load_dotenv()

@@ -18,242 +18,125 @@ load_dotenv()
  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

+
  # --- Basic Agent Definition ---
  class BasicAgent:
-     def __init__(self):
+     def __init__(self, model="google/gemma-7b"):
+         self.api_url = f"https://api-inference.huggingface.co/models/{model}"
+         self.headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
          print("BasicAgent initialized.")
-         self.session = requests.Session()
-         self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-         })
-         self.cache = {}
-
+
+         #usage
+         #agent = HuggingFaceAgent("google/gemma-7b") # Same architecture as Gemini
+         #print(agent.generate("Explain quantum computing"))
+
+
      def __call__(self, question: str) -> str:
          print(f"Agent received question (first 50 chars): {question[:50]}...")
-         fixed_answer = self.agent.answer_question({question})
-         print(f"Agent returning fixed answer: {fixed_answer}")
+         fixed_answer = self.agent.generate(question)
+         print(f"Agent returning answer: {fixed_answer}")
          return fixed_answer


-     def analyze_query(self, query):
-         """Analyze the query using regex patterns"""
-         return {
-             'entities': self._extract_entities(query),
-             'intent': self._determine_intent(query.lower()),
-             'time_constraints': self._extract_time_constraints(query),
-             'quantities': self._extract_quantities(query)
-         }
+     # to check
+     def generate_response(self, prompt: str) -> str:
+         """Get response from Gema"""
+         try:
+             response = self.model.generate_content(prompt)
+             return response.text
+         except Exception as e:
+             return f"Error generating response: {str(e)}"

-     def _extract_entities(self, text):
-         """Simple entity extraction using capitalization patterns"""
-         # Find proper nouns (capitalized phrases)
-         entities = re.findall(r'([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)', text)
-         # Filter out small words and standalone letters
-         return [(ent, 'UNKNOWN') for ent in entities if len(ent) > 2 and ' ' in ent]
-
-     def _determine_intent(self, query):
-         """Determine intent using keyword patterns"""
-         if 'how many' in query:
-             return 'count'
-         elif 'when' in query or 'date' in query:
-             return 'date'
-         elif 'who' in query:
-             return 'person'
-         elif 'what is' in query or 'define' in query:
-             return 'definition'
-         elif 'list' in query or 'name all' in query:
-             return 'list'
-         return 'general'
-
-     def _extract_time_constraints(self, text):
-         """Extract year ranges from text"""
-         constraints = []
-         # Match patterns like "between 2000 and 2009"
-         range_match = re.search(r'between (\d{4}) and (\d{4})', text)
-         if range_match:
-             constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
-
-         # Match patterns like "in 2005"
-         year_match = re.search(r'in (\d{4})', text)
-         if year_match:
-             constraints.append(('point', int(year_match.group(1))))
-
-         return constraints

-     def _extract_quantities(self, text):
-         """Extract numbers from text"""
-         return [int(match) for match in re.findall(r'\b(\d+)\b', text)]

-     def search_wikipedia(self, query, num_results=3):
-         """Search Wikipedia's API"""
-         url = "https://en.wikipedia.org/w/api.php"
+     def web_search(self, query: str) -> List[Dict]:
+         """Use SearxNG meta-search engine"""
          params = {
-             'action': 'query',
-             'list': 'search',
-             'srsearch': query,
-             'format': 'json',
-             'srlimit': num_results
+             "q": query,
+             "format": "json",
+             "engines": "google,bing,duckduckgo"
          }
          try:
-             response = self.session.get(url, params=params).json()
-             return [{
-                 'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
-                 'title': item['title'],
-                 'snippet': item['snippet'],
-                 'source': 'wikipedia'
-             } for item in response['query']['search']]
-         except Exception as e:
-             print(f"Wikipedia search error: {e}")
+             response = requests.get(self.searx_url, params=params)
+             response.raise_for_status()
+             return response.json().get("results", [])
+         except requests.RequestException:
              return []

-     def fetch_page(self, url):
-         """Fetch and parse a Wikipedia page"""
-         if url in self.cache:
-             return self.cache[url]
+     def wikipedia_search(self, query: str) -> str:
+         """Get Wikipedia summary"""
+         page = self.wiki.page(query)
+         return page.summary if page.exists() else "No Wikipedia page found"
+
+     def process_document(self, file_path: str) -> str:
+         """Handle PDF, Word, CSV, Excel files"""
+         if not os.path.exists(file_path):
+             return "File not found"
+
+         ext = os.path.splitext(file_path)[1].lower()

          try:
-             response = self.session.get(url, timeout=10)
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Clean the page content
-             for element in soup(['script', 'style', 'nav', 'footer', 'table']):
-                 element.decompose()
-
-             page_data = {
-                 'url': url,
-                 'title': soup.title.string if soup.title else '',
-                 'text': ' '.join(soup.stripped_strings),
-                 'soup': soup
-             }
-
-             self.cache[url] = page_data
-             return page_data
+             if ext == '.pdf':
+                 return self._process_pdf(file_path)
+             elif ext in ('.doc', '.docx'):
+                 return self._process_word(file_path)
+             elif ext == '.csv':
+                 return pd.read_csv(file_path).to_string()
+             elif ext in ('.xls', '.xlsx'):
+                 return pd.read_excel(file_path).to_string()
+             else:
+                 return "Unsupported file format"
          except Exception as e:
-             print(f"Error fetching {url}: {e}")
-             return None
+             return f"Error processing document: {str(e)}"

-     def answer_question(self, question):
-         """Answer a question using Wikipedia"""
-         print(f"\nQuestion: {question}")
-
-         # Step 1: Analyze the question
-         analysis = self.analyze_query(question)
-         print(f"Analysis: {analysis}")
-
-         # Step 2: Search Wikipedia
-         search_results = self.search_wikipedia(question)
-         if not search_results:
-             return {"answer": "No Wikipedia results found", "source": None}
-
-         # Step 3: Fetch and analyze pages
-         answers = []
-         for result in search_results:
-             page = self.fetch_page(result['url'])
-             if page:
-                 answer = self._extract_answer(page, analysis)
-                 if answer:
-                     answers.append({
-                         'answer': answer,
-                         'source': result['url'],
-                         'confidence': self._calculate_confidence(answer, analysis)
-                     })
-
-         # Step 4: Return the best answer
-         if not answers:
-             return {"answer": "No answers found in Wikipedia", "source": None}
-
-         answers.sort(key=lambda x: x['confidence'], reverse=True)
-         best_answer = answers[0]
-
-         # Format the output
-         result = {
-             "question": question,
-             "answer": best_answer['answer'],
-             "source": best_answer['source'],
-             "confidence": f"{best_answer['confidence']:.0%}"
-         }
-
-         if isinstance(best_answer['answer'], list):
-             result['answer'] = "\n- " + "\n- ".join(best_answer['answer'])
-
-         return result
-
-     def _extract_answer(self, page, analysis):
-         """Extract answer based on intent"""
-         if analysis['intent'] == 'count':
-             return self._extract_count(page['text'], analysis)
-         elif analysis['intent'] == 'date':
-             return self._extract_date(page['text'], analysis)
-         elif analysis['intent'] == 'list':
-             return self._extract_list(page['soup'], analysis)
-         else:
-             return self._extract_general(page['text'], analysis)
-
-     def _extract_count(self, text, analysis):
-         """Extract a count/number from text"""
-         entities = [e[0] for e in analysis['entities']]
-         pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
-         matches = re.finditer(pattern, text, re.IGNORECASE)
-         counts = [int(match.group(1)) for match in matches]
-         return max(counts) if counts else None
-
-     def _extract_date(self, text, analysis):
-         """Extract dates from text"""
-         date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
-         dates = [match.group(0) for match in re.finditer(date_pattern, text)]
-         entities = [e[0] for e in analysis['entities']]
-         return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
-
-     def _extract_list(self, soup, analysis):
-         """Extract list items from page"""
-         entities = [e[0] for e in analysis['entities']]
-         items = []
-         for list_tag in soup.find_all(['ul', 'ol']):
-             list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
-             if any(e.lower() in ' '.join(list_items).lower() for e in entities):
-                 items.extend(list_items)
-         return items if items else None
-
-     def _extract_general(self, text, analysis):
-         """Extract general information from text"""
-         entities = [e[0] for e in analysis['entities']]
-         sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-         relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
-         return ' '.join(relevant) if relevant else None
-
-     def _calculate_confidence(self, answer, analysis):
-         """Calculate confidence score for an answer"""
-         confidence = 0.5  # Base confidence
-
-         if analysis['intent'] == 'count' and isinstance(answer, int):
-             confidence += 0.3
-         elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
-             confidence += 0.3
-         elif analysis['intent'] == 'list' and isinstance(answer, list):
-             confidence += 0.3
-
-         if analysis['time_constraints'] and str(answer):
-             for constraint in analysis['time_constraints']:
-                 if constraint[0] == 'range':
-                     years = re.findall(r'\b(19|20)\d{2}\b', str(answer))
-                     if any(constraint[1] <= int(y) <= constraint[2] for y in years):
-                         confidence += 0.2
-
-         return min(0.99, max(0.1, confidence))
+     def _process_pdf(self, file_path: str) -> str:
+         """Process PDF using Gemini's vision capability"""
+         try:
+             # For Gemini 1.5 or later which supports file uploads
+             with open(file_path, "rb") as f:
+                 file = genai.upload_file(f)
+             response = self.model.generate_content(
+                 ["Extract and summarize the key points from this document:", file]
+             )
+             return response.text
+         except:
+             # Fallback for older Gemini versions
+             try:
+                 import PyPDF2
+                 with open(file_path, 'rb') as f:
+                     reader = PyPDF2.PdfReader(f)
+                     return "\n".join([page.extract_text() for page in reader.pages])
+             except ImportError:
+                 return "PDF processing requires PyPDF2 (pip install PyPDF2)"
+
+     def _process_word(self, file_path: str) -> str:
+         """Process Word documents"""
+         try:
+             from docx import Document
+             doc = Document(file_path)
+             return "\n".join([para.text for para in doc.paragraphs])
+         except ImportError:
+             return "Word processing requires python-docx (pip install python-docx)"

- if __name__ == "__main__":
-     agent = BasicAgent()
-
-     questions = [
-         "How many studio albums did Taylor Swift release between 2010 and 2015?",
-         "When was the first iPhone released?",
-         "What is the capital of Canada?",
-         "List the planets in our solar system"
-     ]
-
-     for question in questions:
-         result = agent.answer_question(question)
-         print(f"\nAnswer: {result['answer']}")
+     def process_request(self, request: Union[str, Dict]) -> str:
+         """
+         Handle different request types:
+         - Direct text queries
+         - File processing requests
+         - Complex multi-step requests
+         """
+         if isinstance(request, dict):
+             if 'steps' in request:
+                 results = []
+                 for step in request['steps']:
+                     if step['type'] == 'search':
+                         results.append(self.web_search(step['query']))
+                     elif step['type'] == 'process':
+                         results.append(self.process_document(step['file']))
+                 return self.generate_response(f"Process these results: {results}")
+             return "Unsupported request format"
+
+         return self.generate_response(request)

  def run_and_submit_all( profile: gr.OAuthProfile | None):
  """