wt002 committed on
Commit 4f47377 · verified · 1 Parent(s): 8289a44

Update app.py

Files changed (1)
  1. app.py +43 -60
app.py CHANGED
@@ -11,9 +11,6 @@ import requests
 from bs4 import BeautifulSoup
 import re
 from urllib.parse import quote
-import requests
-from urllib.parse import quote
-from googlesearch import search
 
 load_dotenv()
 
@@ -25,6 +22,11 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+        self.cache = {}
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         fixed_answer = self.answer_question(question)
@@ -32,36 +34,27 @@ class BasicAgent:
         return fixed_answer
 
 
-
-class BasicAgent:
-    def __init__(self):
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        })
-        self.cache = {}
-
     def analyze_query(self, query):
-        """Simplified query analysis without spaCy"""
-        analysis = {
+        """Analyze the query using regex patterns"""
+        return {
             'entities': self._extract_entities(query),
             'intent': self._determine_intent(query.lower()),
             'time_constraints': self._extract_time_constraints(query),
             'quantities': self._extract_quantities(query)
         }
-        return analysis
 
     def _extract_entities(self, text):
-        """Simple entity extraction using patterns"""
-        # Extract capitalized phrases (crude named entity recognition)
-        entities = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text)
-        return [(ent, 'UNKNOWN') for ent in entities if len(ent.split()) < 4]
+        """Simple entity extraction using capitalization patterns"""
+        # Find proper nouns (capitalized phrases)
+        entities = re.findall(r'([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)', text)
+        # Filter out small words and standalone letters
+        return [(ent, 'UNKNOWN') for ent in entities if len(ent) > 2 and ' ' in ent]
 
     def _determine_intent(self, query):
-        """Determine intent using keyword matching"""
+        """Determine intent using keyword patterns"""
         if 'how many' in query:
             return 'count'
-        elif 'when' in query:
+        elif 'when' in query or 'date' in query:
             return 'date'
         elif 'who' in query:
             return 'person'
@@ -72,12 +65,14 @@ class BasicAgent:
         return 'general'
 
     def _extract_time_constraints(self, text):
-        """Extract time ranges from text"""
+        """Extract year ranges from text"""
         constraints = []
+        # Match patterns like "between 2000 and 2009"
         range_match = re.search(r'between (\d{4}) and (\d{4})', text)
         if range_match:
             constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
 
+        # Match patterns like "in 2005"
         year_match = re.search(r'in (\d{4})', text)
         if year_match:
             constraints.append(('point', int(year_match.group(1))))
@@ -85,46 +80,33 @@ class BasicAgent:
         return constraints
 
     def _extract_quantities(self, text):
-        """Extract numerical quantities from text"""
+        """Extract numbers from text"""
         return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
 
-    def search_web(self, query, num_results=3):
-        """Search the web using Google and Wikipedia"""
-        results = []
-
-        # Google search
-        try:
-            results.extend({
-                'url': url,
-                'source': 'google'
-            } for url in search(query, num_results=num_results, stop=num_results))
-        except Exception as e:
-            print(f"Google search error: {e}")
-
-        # Wikipedia search
+    def search_wikipedia(self, query, num_results=3):
+        """Search Wikipedia's API"""
+        url = "https://en.wikipedia.org/w/api.php"
+        params = {
+            'action': 'query',
+            'list': 'search',
+            'srsearch': query,
+            'format': 'json',
+            'srlimit': num_results
+        }
         try:
-            wiki_url = "https://en.wikipedia.org/w/api.php"
-            params = {
-                'action': 'query',
-                'list': 'search',
-                'srsearch': query,
-                'format': 'json',
-                'srlimit': num_results
-            }
-            response = self.session.get(wiki_url, params=params).json()
-            results.extend({
+            response = self.session.get(url, params=params).json()
+            return [{
                 'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
                 'title': item['title'],
                 'snippet': item['snippet'],
                 'source': 'wikipedia'
-            } for item in response['query']['search'])
+            } for item in response['query']['search']]
         except Exception as e:
             print(f"Wikipedia search error: {e}")
-
-        return results[:num_results*2]
+            return []
 
     def fetch_page(self, url):
-        """Fetch and parse a web page with caching"""
+        """Fetch and parse a Wikipedia page"""
        if url in self.cache:
            return self.cache[url]
 
@@ -133,7 +115,7 @@ class BasicAgent:
             soup = BeautifulSoup(response.text, 'html.parser')
 
             # Clean the page content
-            for element in soup(['script', 'style', 'nav', 'footer']):
+            for element in soup(['script', 'style', 'nav', 'footer', 'table']):
                 element.decompose()
 
             page_data = {
@@ -149,17 +131,18 @@ class BasicAgent:
             print(f"Error fetching {url}: {e}")
             return None
 
-    def answer_question(self, question, num_sources=3):
-        """Main method to answer a question"""
+    def answer_question(self, question):
+        """Answer a question using Wikipedia"""
         print(f"\nQuestion: {question}")
 
         # Step 1: Analyze the question
         analysis = self.analyze_query(question)
         print(f"Analysis: {analysis}")
 
-        # Step 2: Search the web
-        search_results = self.search_web(question, num_sources)
-        print(f"Found {len(search_results)} potential sources")
+        # Step 2: Search Wikipedia
+        search_results = self.search_wikipedia(question)
+        if not search_results:
+            return {"answer": "No Wikipedia results found", "source": None}
 
         # Step 3: Fetch and analyze pages
         answers = []
@@ -176,7 +159,7 @@ class BasicAgent:
 
         # Step 4: Return the best answer
         if not answers:
-            return {"answer": "No answers found", "source": None}
+            return {"answer": "No answers found in Wikipedia", "source": None}
 
         answers.sort(key=lambda x: x['confidence'], reverse=True)
         best_answer = answers[0]
@@ -210,7 +193,7 @@ class BasicAgent:
         entities = [e[0] for e in analysis['entities']]
         pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
         matches = re.finditer(pattern, text, re.IGNORECASE)
         counts = [int(match.group(1)) for match in matches]
         return max(counts) if counts else None
 
     def _extract_date(self, text, analysis):
@@ -259,7 +242,7 @@ class BasicAgent:
 
 # Example usage
 if __name__ == "__main__":
-    agent = SimpleWebSearchAgent()
+    agent = BasicAgent()
 
     questions = [
         "How many studio albums did Taylor Swift release between 2010 and 2015?",