Final_Assignment_Project

Running

App Files Files Community

wt002 committed 5 days ago

Commit

f71d65e

verified ·

1 Parent(s): 1619cab

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -117

app.py CHANGED Viewed

@@ -11,7 +11,8 @@ import requests
 from bs4 import BeautifulSoup
 import re
 from urllib.parse import quote
-import spacy
 from googlesearch import search
 load_dotenv()
@@ -26,13 +27,14 @@ class BasicAgent:
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = WebSearchAgent.run({question})
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
-class WebSearchAgent:
     def __init__(self):
-        self.nlp = spacy.load("en_core_web_sm")
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
@@ -40,19 +42,23 @@ class WebSearchAgent:
         self.cache = {}
     def analyze_query(self, query):
-        """Analyze the query to determine intent and extract entities"""
-        doc = self.nlp(query)
         analysis = {
-            'entities': [(ent.text, ent.label_) for ent in doc.ents],
             'intent': self._determine_intent(query.lower()),
             'time_constraints': self._extract_time_constraints(query),
             'quantities': self._extract_quantities(query)
         }
         return analysis
     def _determine_intent(self, query):
-        """Determine the intent of the query"""
         if 'how many' in query:
             return 'count'
         elif 'when' in query:
@@ -68,12 +74,10 @@ class WebSearchAgent:
     def _extract_time_constraints(self, text):
         """Extract time ranges from text"""
         constraints = []
-        # Match patterns like "between 2000 and 2009"
         range_match = re.search(r'between (\d{4}) and (\d{4})', text)
         if range_match:
             constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
-        # Match patterns like "in 2005"
         year_match = re.search(r'in (\d{4})', text)
         if year_match:
             constraints.append(('point', int(year_match.group(1))))
@@ -85,45 +89,39 @@ class WebSearchAgent:
         return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
     def search_web(self, query, num_results=3):
-        """Search the web using multiple sources"""
-        sources = {
-            'wikipedia': self._search_wikipedia,
-            'google': self._search_google
-        }
         results = []
-        for source_name, search_func in sources.items():
-            try:
-                results.extend(search_func(query, num_results))
-            except Exception as e:
-                print(f"Error searching {source_name}: {e}")
-        return results[:num_results*2]  # Return max of double the requested results
-    def _search_wikipedia(self, query, num_results):
-        """Search Wikipedia API"""
-        url = "https://en.wikipedia.org/w/api.php"
-        params = {
-            'action': 'query',
-            'list': 'search',
-            'srsearch': query,
-            'format': 'json',
-            'srlimit': num_results
-        }
-        response = self.session.get(url, params=params).json()
-        return [{
-            'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
-            'title': item['title'],
-            'snippet': item['snippet'],
-            'source': 'wikipedia'
-        } for item in response['query']['search']]
-    def _search_google(self, query, num_results):
-        """Search Google using python-googlesearch"""
-        return [{
-            'url': url,
-            'source': 'google'
-        } for url in search(query, num_results=num_results, stop=num_results)]
     def fetch_page(self, url):
         """Fetch and parse a web page with caching"""
@@ -151,11 +149,53 @@ class WebSearchAgent:
             print(f"Error fetching {url}: {e}")
             return None
-    def extract_answer(self, page, analysis):
-        """Extract relevant information from a page based on query analysis"""
-        if not page:
-            return None
         if analysis['intent'] == 'count':
             return self._extract_count(page['text'], analysis)
         elif analysis['intent'] == 'date':
@@ -170,15 +210,13 @@ class WebSearchAgent:
         entities = [e[0] for e in analysis['entities']]
         pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
         matches = re.finditer(pattern, text, re.IGNORECASE)
-        counts = [int(match.group(1)) for match in matches]
         return max(counts) if counts else None
     def _extract_date(self, text, analysis):
         """Extract dates from text"""
         date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
         dates = [match.group(0) for match in re.finditer(date_pattern, text)]
         entities = [e[0] for e in analysis['entities']]
         return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
@@ -186,65 +224,23 @@ class WebSearchAgent:
         """Extract list items from page"""
         entities = [e[0] for e in analysis['entities']]
         items = []
         for list_tag in soup.find_all(['ul', 'ol']):
             list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
             if any(e.lower() in ' '.join(list_items).lower() for e in entities):
                 items.extend(list_items)
         return items if items else None
     def _extract_general(self, text, analysis):
         """Extract general information from text"""
         entities = [e[0] for e in analysis['entities']]
         sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
         relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
         return ' '.join(relevant) if relevant else None
-    def answer_question(self, question, num_sources=3):
-        """Main method to answer a question"""
-        print(f"Processing question: {question}")
-        # Step 1: Analyze the question
-        analysis = self.analyze_query(question)
-        print(f"Analysis: {analysis}")
-        # Step 2: Search the web
-        search_results = self.search_web(question, num_sources)
-        print(f"Found {len(search_results)} potential sources")
-        # Step 3: Fetch and analyze pages
-        answers = []
-        for result in search_results:
-            page = self.fetch_page(result['url'])
-            if page:
-                answer = self.extract_answer(page, analysis)
-                if answer:
-                    answers.append({
-                        'answer': answer,
-                        'source': result['url'],
-                        'confidence': self._calculate_confidence(answer, analysis)
-                    })
-        # Step 4: Return the best answer
-        if not answers:
-            return {"status": "No answers found"}
-        answers.sort(key=lambda x: x['confidence'], reverse=True)
-        return {
-            "question": question,
-            "best_answer": answers[0]['answer'],
-            "source": answers[0]['source'],
-            "confidence": answers[0]['confidence'],
-            "all_answers": answers
-        }
     def _calculate_confidence(self, answer, analysis):
         """Calculate confidence score for an answer"""
         confidence = 0.5  # Base confidence
-        # Type matching
         if analysis['intent'] == 'count' and isinstance(answer, int):
             confidence += 0.3
         elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
@@ -252,7 +248,6 @@ class WebSearchAgent:
         elif analysis['intent'] == 'list' and isinstance(answer, list):
             confidence += 0.3
-        # Time constraints
         if analysis['time_constraints'] and str(answer):
             for constraint in analysis['time_constraints']:
                 if constraint[0] == 'range':
@@ -260,33 +255,25 @@ class WebSearchAgent:
                     if any(constraint[1] <= int(y) <= constraint[2] for y in years):
                         confidence += 0.2
-        return min(0.99, max(0.1, confidence))  # Keep within bounds
-# Example usage
 if __name__ == "__main__":
-    agent = WebSearchAgent()
     questions = [
-        "How many studio albums were published by Taylor Swift between 2010 and 2015?",
-        "When was Albert Einstein born?",
-        "What is the capital of Australia?",
-        "List the members of The Beatles"
     ]
     for question in questions:
-        print("\n" + "="*50)
-        print(f"Question: {question}")
         result = agent.answer_question(question)
-        print("\nBest Answer:")
-        if isinstance(result['best_answer'], list):
-            for item in result['best_answer']:
-                print(f"- {item}")
-        else:
-            print(result['best_answer'])
-        print(f"\nSource: {result['source']}")
-        print(f"Confidence: {result['confidence']:.0%}")

 from bs4 import BeautifulSoup
 import re
 from urllib.parse import quote
+import requests
+from urllib.parse import quote
 from googlesearch import search
 load_dotenv()
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = agent.answer_question({question})
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
+class BasicAgent:
     def __init__(self):
         self.session = requests.Session()
         self.session.headers.update({
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         self.cache = {}
     def analyze_query(self, query):
+        """Simplified query analysis without spaCy"""
         analysis = {
+            'entities': self._extract_entities(query),
             'intent': self._determine_intent(query.lower()),
             'time_constraints': self._extract_time_constraints(query),
             'quantities': self._extract_quantities(query)
         }
         return analysis
+    def _extract_entities(self, text):
+        """Simple entity extraction using patterns"""
+        # Extract capitalized phrases (crude named entity recognition)
+        entities = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text)
+        return [(ent, 'UNKNOWN') for ent in entities if len(ent.split()) < 4]
     def _determine_intent(self, query):
+        """Determine intent using keyword matching"""
         if 'how many' in query:
             return 'count'
         elif 'when' in query:
     def _extract_time_constraints(self, text):
         """Extract time ranges from text"""
         constraints = []
         range_match = re.search(r'between (\d{4}) and (\d{4})', text)
         if range_match:
             constraints.append(('range', int(range_match.group(1)), int(range_match.group(2))))
         year_match = re.search(r'in (\d{4})', text)
         if year_match:
             constraints.append(('point', int(year_match.group(1))))
         return [int(match) for match in re.findall(r'\b(\d+)\b', text)]
     def search_web(self, query, num_results=3):
+        """Search the web using Google and Wikipedia"""
         results = []
+        # Google search
+        try:
+            results.extend({
+                'url': url,
+                'source': 'google'
+            } for url in search(query, num_results=num_results, stop=num_results))
+        except Exception as e:
+            print(f"Google search error: {e}")
+        # Wikipedia search
+        try:
+            wiki_url = "https://en.wikipedia.org/w/api.php"
+            params = {
+                'action': 'query',
+                'list': 'search',
+                'srsearch': query,
+                'format': 'json',
+                'srlimit': num_results
+            }
+            response = self.session.get(wiki_url, params=params).json()
+            results.extend({
+                'url': f"https://en.wikipedia.org/wiki/{item['title'].replace(' ', '_')}",
+                'title': item['title'],
+                'snippet': item['snippet'],
+                'source': 'wikipedia'
+            } for item in response['query']['search'])
+        except Exception as e:
+            print(f"Wikipedia search error: {e}")
+        return results[:num_results*2]
     def fetch_page(self, url):
         """Fetch and parse a web page with caching"""
             print(f"Error fetching {url}: {e}")
             return None
+    def answer_question(self, question, num_sources=3):
+        """Main method to answer a question"""
+        print(f"\nQuestion: {question}")
+        # Step 1: Analyze the question
+        analysis = self.analyze_query(question)
+        print(f"Analysis: {analysis}")
+        # Step 2: Search the web
+        search_results = self.search_web(question, num_sources)
+        print(f"Found {len(search_results)} potential sources")
+        # Step 3: Fetch and analyze pages
+        answers = []
+        for result in search_results:
+            page = self.fetch_page(result['url'])
+            if page:
+                answer = self._extract_answer(page, analysis)
+                if answer:
+                    answers.append({
+                        'answer': answer,
+                        'source': result['url'],
+                        'confidence': self._calculate_confidence(answer, analysis)
+                    })
+        # Step 4: Return the best answer
+        if not answers:
+            return {"answer": "No answers found", "source": None}
+        answers.sort(key=lambda x: x['confidence'], reverse=True)
+        best_answer = answers[0]
+        # Format the output
+        result = {
+            "question": question,
+            "answer": best_answer['answer'],
+            "source": best_answer['source'],
+            "confidence": f"{best_answer['confidence']:.0%}"
+        }
+        if isinstance(best_answer['answer'], list):
+            result['answer'] = "\n- " + "\n- ".join(best_answer['answer'])
+        return result
+    def _extract_answer(self, page, analysis):
+        """Extract answer based on intent"""
         if analysis['intent'] == 'count':
             return self._extract_count(page['text'], analysis)
         elif analysis['intent'] == 'date':
         entities = [e[0] for e in analysis['entities']]
         pattern = r'(\b\d+\b)[^\.]*\b(' + '|'.join(re.escape(e) for e in entities) + r')\b'
         matches = re.finditer(pattern, text, re.IGNORECASE)
+        counts = [int(match.group(1))) for match in matches]
         return max(counts) if counts else None
     def _extract_date(self, text, analysis):
         """Extract dates from text"""
         date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
         dates = [match.group(0) for match in re.finditer(date_pattern, text)]
         entities = [e[0] for e in analysis['entities']]
         return next((d for d in dates if any(e.lower() in text.lower() for e in entities)), None)
         """Extract list items from page"""
         entities = [e[0] for e in analysis['entities']]
         items = []
         for list_tag in soup.find_all(['ul', 'ol']):
             list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
             if any(e.lower() in ' '.join(list_items).lower() for e in entities):
                 items.extend(list_items)
         return items if items else None
     def _extract_general(self, text, analysis):
         """Extract general information from text"""
         entities = [e[0] for e in analysis['entities']]
         sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
         relevant = [s for s in sentences if any(e.lower() in s.lower() for e in entities)]
         return ' '.join(relevant) if relevant else None
     def _calculate_confidence(self, answer, analysis):
         """Calculate confidence score for an answer"""
         confidence = 0.5  # Base confidence
         if analysis['intent'] == 'count' and isinstance(answer, int):
             confidence += 0.3
         elif analysis['intent'] == 'date' and re.match(r'.*\d{4}.*', str(answer)):
         elif analysis['intent'] == 'list' and isinstance(answer, list):
             confidence += 0.3
         if analysis['time_constraints'] and str(answer):
             for constraint in analysis['time_constraints']:
                 if constraint[0] == 'range':
                     if any(constraint[1] <= int(y) <= constraint[2] for y in years):
                         confidence += 0.2
+        return min(0.99, max(0.1, confidence))
+# Example usage
 if __name__ == "__main__":
+    agent = SimpleWebSearchAgent()
     questions = [
+        "How many studio albums did Taylor Swift release between 2010 and 2015?",
+        "When was the first iPhone released?",
+        "What is the capital of Canada?",
+        "List the planets in our solar system"
     ]
     for question in questions:
         result = agent.answer_question(question)
+        print(f"\nAnswer: {result['answer']}")
+        #print(f"Source: {result['source']}")
+        #print(f"Confidence: {result['confidence']}")
+        #print("="*50)