Update app.py

app.py
CHANGED
@@ -6,11 +6,11 @@ import requests
 from typing import List, Dict, Union
 import requests
 import wikipediaapi
-import
+import google.generativeai as genai
+from typing import List, Dict, Union
 import requests
-
-import
-from urllib.parse import quote
+import wikipediaapi
+import pandas as pd
 
 load_dotenv()
 
@@ -18,242 +18,125 @@ load_dotenv()
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
+
 # --- Basic Agent Definition ---
 class BasicAgent:
-    def __init__(self):
+    def __init__(self, model="google/gemma-7b"):
+        self.api_url = f"https://api-inference.huggingface.co/models/{model}"
+        self.headers = {"Authorization": f"Bearer {os.getenv('HF_API_KEY')}"}
         print("BasicAgent initialized.")
-
-
-
-
-
-
+
+        #usage
+        #agent = HuggingFaceAgent("google/gemma-7b") # Same architecture as Gemini
+        #print(agent.generate("Explain quantum computing"))
+
+
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = self.agent.
-        print(f"Agent returning
+        fixed_answer = self.agent.generate(question)
+        print(f"Agent returning answer: {fixed_answer}")
         return fixed_answer
 
 
-
-
-
-
-
-
-
-
+    # to check
+    def generate_response(self, prompt: str) -> str:
+        """Get response from Gema"""
+        try:
+            response = self.model.generate_content(prompt)
+            return response.text
+        except Exception as e:
+            return f"Error generating response: {str(e)}"
 
-    def _extract_entities(self, text):
-        """Simple entity extraction using capitalization patterns"""
-        # Find proper nouns (capitalized phrases)
-        entities = re.findall(r'([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)', text)
-        # Filter out small words and standalone letters
-        return [(ent, 'UNKNOWN') for ent in entities if len(ent) > 2 and ' ' in ent]
-
-    def _determine_intent(self, query):
-        """Determine intent using keyword patterns"""
-        if 'how many' in query:
-            return 'count'
-        elif 'when' in query or 'date' in query:
-            return 'date'
-        elif 'who' in query:
-            return 'person'
-        elif 'what is' in query or 'define' in query:
-            return 'definition'
-        elif 'list' in query or 'name all' in query:
-            return 'list'
-        return 'general'
-
-    def _extract_time_constraints(self, text):
-        """Extract year ranges from text"""
-        constraints = []
-        # Match patterns like "between 2000 and 2009"
-        range_match = re.search(r'between (\d{4}) and (\d{4})', text)
-        if range_match:
-            constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
-
-        # Match patterns like "in 2005"
-        year_match = re.search(r'in (\d{4})', text)
-        if year_match:
-            constraints.append(('point', int(year_match.group(1))))
-
-        return constraints
 
-    def _extract_quantities(self, text):
-        """Extract numbers from text"""
-        return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
 
-    def
-        """
-        url = "https://en.wikipedia.org/w/api.php"
+    def web_search(self, query: str) -> List[Dict]:
+        """Use SearxNG meta-search engine"""
         params = {
-
-
-
-            'format': 'json',
-            'srlimit': num_results
+            "q": query,
+            "format": "json",
+            "engines": "google,bing,duckduckgo"
         }
         try:
-            response =
-
-
-
-                'snippet': item['snippet'],
-                'source': 'wikipedia'
-            } for item in response['query']['search']]
-        except Exception as e:
-            print(f"Wikipedia search error: {e}")
+            response = requests.get(self.searx_url, params=params)
+            response.raise_for_status()
+            return response.json().get("results", [])
+        except requests.RequestException:
             return []
 
-    def
-        """
-
-
+    def wikipedia_search(self, query: str) -> str:
+        """Get Wikipedia summary"""
+        page = self.wiki.page(query)
+        return page.summary if page.exists() else "No Wikipedia page found"
+
+    def process_document(self, file_path: str) -> str:
+        """Handle PDF, Word, CSV, Excel files"""
+        if not os.path.exists(file_path):
+            return "File not found"
+
+        ext = os.path.splitext(file_path)[1].lower()
 
         try:
-
-
-
-
-
-
-
-
-
-
-                'text': ' '.join(soup.stripped_strings),
-                'soup': soup
-            }
-
-            self.cache[url] = page_data
-            return page_data
+            if ext == '.pdf':
+                return self._process_pdf(file_path)
+            elif ext in ('.doc', '.docx'):
+                return self._process_word(file_path)
+            elif ext == '.csv':
+                return pd.read_csv(file_path).to_string()
+            elif ext in ('.xls', '.xlsx'):
+                return pd.read_excel(file_path).to_string()
+            else:
+                return "Unsupported file format"
         except Exception as e:
-
-            return None
+            return f"Error processing document: {str(e)}"
 
-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return {"answer": "No answers found in Wikipedia", "source": None}
-
-        answers.sort(key=lambda x: x['confidence'], reverse=True)
-        best_answer = answers[0]
-
-        # Format the output
-        result = {
-            "question": question,
-            "answer": best_answer['answer'],
-            "source": best_answer['source'],
-            "confidence": f"{best_answer['confidence']:.0%}"
-        }
-
-        if isinstance(best_answer['answer'], list):
-            result['answer'] = "\n- " + "\n- ".join(best_answer['answer'])
-
-        return result
-
-    def _extract_answer(self, page, analysis):
-        """Extract answer based on intent"""
-        if analysis['intent'] == 'count':
-            return self._extract_count(page['text'], analysis)
-        elif analysis['intent'] == 'date':
-            return self._extract_date(page['text'], analysis)
-        elif analysis['intent'] == 'list':
-            return self._extract_list(page['soup'], analysis)
-        else:
-            return self._extract_general(page['text'], analysis)
-
-    def _extract_count(self, text, analysis):
-        """Extract a count/number from text"""
-        entities = [e[0] for e in analysis['entities']]
-        pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
-        matches = re.finditer(pattern, text, re.IGNORECASE)
-        counts = [int(match.group(1)) for match in matches]
-        return max(counts) if counts else None
-
-    def _extract_date(self, text, analysis):
-        """Extract dates from text"""
-        date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
-        dates = [match.group(0) for match in re.finditer(date_pattern, text)]
-        entities = [e[0] for e in analysis['entities']]
-        return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
-
-    def _extract_list(self, soup, analysis):
-        """Extract list items from page"""
-        entities = [e[0] for e in analysis['entities']]
-        items = []
-        for list_tag in soup.find_all(['ul', 'ol']):
-            list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
-            if any(e.lower() in ' '.join(list_items).lower() for e in entities):
-                items.extend(list_items)
-        return items if items else None
-
-    def _extract_general(self, text, analysis):
-        """Extract general information from text"""
-        entities = [e[0] for e in analysis['entities']]
-        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-        relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
-        return ' '.join(relevant) if relevant else None
-
-    def _calculate_confidence(self, answer, analysis):
-        """Calculate confidence score for an answer"""
-        confidence = 0.5 # Base confidence
-
-        if analysis['intent'] == 'count' and isinstance(answer, int):
-            confidence += 0.3
-        elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
-            confidence += 0.3
-        elif analysis['intent'] == 'list' and isinstance(answer, list):
-            confidence += 0.3
-
-        if analysis['time_constraints'] and str(answer):
-            for constraint in analysis['time_constraints']:
-                if constraint[0] == 'range':
-                    years = re.findall(r'\b(19|20)\d{2}\b', str(answer))
-                    if any(constraint[1] <= int(y) <= constraint[2] for y in years):
-                        confidence += 0.2
-
-        return min(0.99, max(0.1, confidence))
+    def _process_pdf(self, file_path: str) -> str:
+        """Process PDF using Gemini's vision capability"""
+        try:
+            # For Gemini 1.5 or later which supports file uploads
+            with open(file_path, "rb") as f:
+                file = genai.upload_file(f)
+            response = self.model.generate_content(
+                ["Extract and summarize the key points from this document:", file]
+            )
+            return response.text
+        except:
+            # Fallback for older Gemini versions
+            try:
+                import PyPDF2
+                with open(file_path, 'rb') as f:
+                    reader = PyPDF2.PdfReader(f)
+                    return "\n".join([page.extract_text() for page in reader.pages])
+            except ImportError:
+                return "PDF processing requires PyPDF2 (pip install PyPDF2)"
+
+    def _process_word(self, file_path: str) -> str:
+        """Process Word documents"""
+        try:
+            from docx import Document
+            doc = Document(file_path)
+            return "\n".join([para.text for para in doc.paragraphs])
+        except ImportError:
+            return "Word processing requires python-docx (pip install python-docx)"
 
-
-
-
-
-
-        "
-
-
-
-
-
-
+    def process_request(self, request: Union[str, Dict]) -> str:
+        """
+        Handle different request types:
+        - Direct text queries
+        - File processing requests
+        - Complex multi-step requests
+        """
+        if isinstance(request, dict):
+            if 'steps' in request:
+                results = []
+                for step in request['steps']:
+                    if step['type'] == 'search':
+                        results.append(self.web_search(step['query']))
+                    elif step['type'] == 'process':
+                        results.append(self.process_document(step['file']))
+                return self.generate_response(f"Process these results: {results}")
+            return "Unsupported request format"
+
+        return self.generate_response(request)
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
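Similarly, `web_search` reads `self.searx_url` and `wikipedia_search` reads `self.wiki`, but neither attribute is set in the hunks shown. A sketch of the missing setup; the helper name and the SearxNG instance URL are placeholders, not from the commit:

    import wikipediaapi

    def attach_search_backends(agent, searx_url="https://searx.example.org/search"):
        """Hypothetical helper binding the attributes the search methods expect."""
        agent.searx_url = searx_url  # a /search endpoint that allows format=json
        agent.wiki = wikipediaapi.Wikipedia(user_agent="BasicAgent/0.1", language="en")
        return agent

Many public SearxNG instances disable the JSON output format, so a self-hosted instance is the safer assumption here.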