Final_Assignment_Project

Running

App Files Files Community

wt002 committed 5 days ago

Commit

256b6ef

verified ·

1 Parent(s): 00a9519

Update app.py

Browse files

Files changed (1) hide show

app.py +266 -72

app.py CHANGED Viewed

@@ -7,7 +7,12 @@ from typing import List, Dict, Union
 import requests
 import wikipediaapi
 import pandas as pd
-from duckduckgo_search import DDGS
 load_dotenv()
@@ -15,86 +20,275 @@ load_dotenv()
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# Custom search tool class
-class CustomDuckDuckGoSearchTool:
-    def __call__(self, query: str, max_results: int = 5):
-        try:
-            with DDGS() as ddgs:
-                results = []
-                for r in ddgs.text(query):
-                    results.append(r)
-                    if len(results) >= max_results:
-                        break
-            return results
-        except Exception as e:
-            return f"Search error: {str(e)}"
-# Dummy placeholder for `visit_webpage` tool
-class VisitWebpageTool:
-    def __call__(self, url: str):
-        return f"Pretending to visit: {url}"
-# Final answer tool to format and return the final response
-class FinalAnswerTool:
-    def __call__(self, results):
-        formatted_answer = "Final Answer:\n"
-        for result in results:
-            formatted_answer += f"- {str(result)}\n"
-        return formatted_answer
-# Dummy model
-class DummyModel:
-    def call(self, input_text):
-        return f"Model processing: {input_text}"
-# Modified ToolCallingAgent to use FinalAnswerTool
-class ToolCallingAgent:
-    def __init__(self, tools, model, final_answer_tool, max_steps=10):
-        self.tools = tools
-        self.model = model
-        self.final_answer_tool = final_answer_tool
-        self.max_steps = max_steps
-    def run(self, query):
-        print(f"Running agent with query: {query}")
-        tool_outputs = []
-        for tool in self.tools:
-            output = tool(query)
-            print("Tool output:", output)
-            tool_outputs.append(output)
-        # Use the final answer tool to format the collected outputs
-        final_result = self.final_answer_tool(tool_outputs)
-        print(final_result)
-        return final_result
-# Initialize tools and model
-model = DummyModel()
-search_tool = CustomDuckDuckGoSearchTool()
-visit_webpage = VisitWebpageTool()
-final_answer = FinalAnswerTool()
-# Initialize the agent
-web_agent = ToolCallingAgent(
-    tools=[search_tool, visit_webpage],
-    model="google/gemma-7b",
-    final_answer_tool=final_answer,
-    max_steps=10
-)
-# Example usage
-#web_agent.run("Latest AI tools")
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = web_agent.run({question})
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """

 import requests
 import wikipediaapi
 import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+import re
+from urllib.parse import quote
+import spacy
+from googlesearch import search
 load_dotenv()
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = WebSearchAgent.run({question})
         print(f"Agent returning fixed answer: {fixed_answer}")
         return fixed_answer
class WebSearchAgent:
    """Answer questions by searching Wikipedia and Google and scanning pages.

    The pipeline is: analyze the question (entities, intent, time constraints,
    numbers), collect candidate URLs, fetch and clean each page, extract a
    candidate answer according to the detected intent, and rank candidates by
    a heuristic confidence score.
    """

    def __init__(self):
        """Load the spaCy model and prepare an HTTP session and page cache."""
        self.nlp = spacy.load("en_core_web_sm")
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Maps URL -> parsed page dict produced by fetch_page.
        self.cache = {}

    def analyze_query(self, query):
        """Analyze the query to determine intent and extract entities.

        Returns:
            A dict with keys ``entities`` (list of ``(text, label)`` tuples),
            ``intent``, ``time_constraints`` and ``quantities``.
        """
        doc = self.nlp(query)
        return {
            'entities': [(ent.text, ent.label_) for ent in doc.ents],
            'intent': self._determine_intent(query.lower()),
            'time_constraints': self._extract_time_constraints(query),
            'quantities': self._extract_quantities(query),
        }

    def _determine_intent(self, query):
        """Classify a lower-cased query by the first matching keyword.

        Returns one of ``'count'``, ``'date'``, ``'person'``,
        ``'definition'``, ``'list'`` or ``'general'``.
        """
        if 'how many' in query:
            return 'count'
        if 'when' in query:
            return 'date'
        if 'who' in query:
            return 'person'
        if 'what is' in query or 'define' in query:
            return 'definition'
        if 'list' in query or 'name all' in query:
            return 'list'
        return 'general'

    def _extract_time_constraints(self, text):
        """Extract year constraints from text.

        Returns:
            A list that may contain ``('range', start, end)`` for
            "between YYYY and YYYY" and ``('point', year)`` for "in YYYY".
            Matching is case-insensitive and anchored on word boundaries so
            that "In 2005" is recognised and longer digit runs are not
            truncated to four digits.
        """
        constraints = []
        range_match = re.search(
            r'\bbetween (\d{4}) and (\d{4})\b', text, re.IGNORECASE
        )
        if range_match:
            constraints.append(
                ('range', int(range_match.group(1)), int(range_match.group(2)))
            )
        year_match = re.search(r'\bin (\d{4})\b', text, re.IGNORECASE)
        if year_match:
            constraints.append(('point', int(year_match.group(1))))
        return constraints

    def _extract_quantities(self, text):
        """Return every standalone run of digits in ``text`` as an int."""
        return [int(match) for match in re.findall(r'\b(\d+)\b', text)]

    def search_web(self, query, num_results=3):
        """Search every configured source and merge the results.

        A failure in one source is printed and does not prevent the other
        sources from being queried.

        Returns:
            At most ``num_results * 2`` result dicts, each with at least
            ``url`` and ``source`` keys.
        """
        sources = {
            'wikipedia': self._search_wikipedia,
            'google': self._search_google,
        }
        results = []
        for source_name, search_func in sources.items():
            try:
                results.extend(search_func(query, num_results))
            except Exception as e:
                print(f"Error searching {source_name}: {e}")
        return results[:num_results * 2]

    def _search_wikipedia(self, query, num_results):
        """Query the Wikipedia search API and return result dicts.

        Raises:
            requests.RequestException: On timeout or a non-2xx response.
            KeyError: If the JSON payload lacks ``query.search``.
            Both are handled by ``search_web``.
        """
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            'action': 'query',
            'list': 'search',
            'srsearch': query,
            'format': 'json',
            'srlimit': num_results,
        }
        # A timeout keeps one slow request from hanging the whole pipeline.
        response = self.session.get(url, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
        return [{
            # Percent-encode the title so characters such as '?' or '#'
            # do not truncate or alter the article URL.
            'url': "https://en.wikipedia.org/wiki/"
                   + quote(item['title'].replace(' ', '_')),
            'title': item['title'],
            'snippet': item['snippet'],
            'source': 'wikipedia',
        } for item in payload['query']['search']]

    def _search_google(self, query, num_results):
        """Return up to ``num_results`` Google result URLs as result dicts.

        The original call passed both ``num_results=`` and ``stop=``; those
        keywords belong to two different ``googlesearch`` distributions, so
        the call raised ``TypeError`` with either one installed.
        NOTE(review): confirm which distribution the project pins; the
        fallback below tries the ``num_results`` keyword first and then
        ``stop``.
        """
        try:
            hits = search(query, num_results=num_results)
        except TypeError:
            hits = search(query, stop=num_results)

        results = []
        for url in hits:
            if len(results) >= num_results:
                break
            results.append({'url': url, 'source': 'google'})
        return results

    def fetch_page(self, url):
        """Fetch and parse a web page with caching.

        Returns:
            A dict with ``url``, ``title``, ``text`` and ``soup`` keys, or
            ``None`` when the request or parsing fails (the error is
            printed). Only successful fetches are cached.
        """
        if url in self.cache:
            return self.cache[url]
        try:
            response = self.session.get(url, timeout=10)
            # Treat HTTP error pages as failures instead of mining them
            # for answers.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop non-content elements before extracting text.
            for element in soup(['script', 'style', 'nav', 'footer']):
                element.decompose()
            page_data = {
                'url': url,
                'title': soup.title.string if soup.title else '',
                'text': ' '.join(soup.stripped_strings),
                'soup': soup,
            }
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None
        self.cache[url] = page_data
        return page_data

    def extract_answer(self, page, analysis):
        """Dispatch to the extractor that matches ``analysis['intent']``.

        Returns ``None`` when ``page`` is falsy or nothing was extracted.
        """
        if not page:
            return None
        intent = analysis['intent']
        if intent == 'count':
            return self._extract_count(page['text'], analysis)
        if intent == 'date':
            return self._extract_date(page['text'], analysis)
        if intent == 'list':
            return self._extract_list(page['soup'], analysis)
        return self._extract_general(page['text'], analysis)

    def _extract_count(self, text, analysis):
        """Return the largest number that precedes an entity in a sentence.

        A number counts when it is followed, without an intervening period,
        by one of the query entities. Returns ``None`` when the query has no
        entities (an empty alternation would match arbitrary positions) or
        when nothing matches.
        """
        entities = [e[0] for e in analysis['entities']]
        if not entities:
            return None
        pattern = (
            r'(\b\d+\b)[^\.]*\b('
            + '|'.join(re.escape(e) for e in entities)
            + r')\b'
        )
        counts = [
            int(match.group(1))
            for match in re.finditer(pattern, text, re.IGNORECASE)
        ]
        return max(counts) if counts else None

    def _extract_date(self, text, analysis):
        """Return the first date-like string if any entity occurs in text.

        Dates are either ``D Month YYYY`` (optional ordinal suffix) or a
        bare four-digit year. Returns ``None`` when there are no entities,
        no entity occurs in ``text``, or no date is found.
        """
        entities = [e[0] for e in analysis['entities']]
        lowered = text.lower()
        # The entity check does not depend on the individual date, so do it
        # once instead of once per candidate.
        if not any(e.lower() in lowered for e in entities):
            return None
        date_pattern = r'\b(\d{1,2}(?:st|nd|rd|th)?\s+(?:\w+)\s+\d{4}|\d{4})\b'
        first = re.search(date_pattern, text)
        return first.group(0) if first else None

    def _extract_list(self, soup, analysis):
        """Collect items of every ``<ul>``/``<ol>`` that mentions an entity.

        Returns the concatenated item texts, or ``None`` when no list
        mentions any entity.
        """
        entity_keys = [e[0].lower() for e in analysis['entities']]
        items = []
        for list_tag in soup.find_all(['ul', 'ol']):
            list_items = [li.get_text().strip() for li in list_tag.find_all('li')]
            joined = ' '.join(list_items).lower()
            if any(key in joined for key in entity_keys):
                items.extend(list_items)
        return items if items else None

    def _extract_general(self, text, analysis):
        """Return the sentences of ``text`` that mention any entity.

        Sentences are joined with single spaces; ``None`` is returned when
        no sentence mentions an entity.
        """
        entity_keys = [e[0].lower() for e in analysis['entities']]
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        relevant = [
            s for s in sentences
            if any(key in s.lower() for key in entity_keys)
        ]
        return ' '.join(relevant) if relevant else None

    def answer_question(self, question, num_sources=3):
        """Main method to answer a question.

        Returns:
            ``{"status": "No answers found"}`` when no page yields an
            answer; otherwise a dict with ``question``, ``best_answer``,
            ``source``, ``confidence`` and ``all_answers`` (sorted by
            descending confidence).
        """
        print(f"Processing question: {question}")
        # Step 1: Analyze the question
        analysis = self.analyze_query(question)
        print(f"Analysis: {analysis}")
        # Step 2: Search the web
        search_results = self.search_web(question, num_sources)
        print(f"Found {len(search_results)} potential sources")
        # Step 3: Fetch and analyze pages
        answers = []
        for result in search_results:
            page = self.fetch_page(result['url'])
            if not page:
                continue
            answer = self.extract_answer(page, analysis)
            if answer:
                answers.append({
                    'answer': answer,
                    'source': result['url'],
                    'confidence': self._calculate_confidence(answer, analysis),
                })
        # Step 4: Return the best answer
        if not answers:
            return {"status": "No answers found"}
        answers.sort(key=lambda x: x['confidence'], reverse=True)
        best = answers[0]
        return {
            "question": question,
            "best_answer": best['answer'],
            "source": best['source'],
            "confidence": best['confidence'],
            "all_answers": answers,
        }

    def _calculate_confidence(self, answer, analysis):
        """Calculate a heuristic confidence score in ``[0.1, 0.99]``.

        Starts at 0.5, adds 0.3 when the answer's type matches the intent
        (int for count, contains a four-digit run for date, list for list),
        and adds 0.2 for each range constraint that contains a year found
        in the answer.
        """
        confidence = 0.5  # Base confidence
        intent = analysis['intent']
        answer_text = str(answer)
        # Type matching
        if intent == 'count' and isinstance(answer, int):
            confidence += 0.3
        elif intent == 'date' and re.match(r'.*\d{4}.*', answer_text):
            confidence += 0.3
        elif intent == 'list' and isinstance(answer, list):
            confidence += 0.3
        # Time constraints
        if analysis['time_constraints'] and answer_text:
            # Non-capturing group: with a capturing group re.findall returns
            # only the century prefix ("19"/"20"), never the full year.
            years = [
                int(y) for y in re.findall(r'\b(?:19|20)\d{2}\b', answer_text)
            ]
            for constraint in analysis['time_constraints']:
                if constraint[0] == 'range':
                    if any(constraint[1] <= y <= constraint[2] for y in years):
                        confidence += 0.2
        return min(0.99, max(0.1, confidence))  # Keep within bounds
# Example usage: run a few sample questions through WebSearchAgent and print
# the best answer, its source URL and the confidence score for each.
if __name__ == "__main__":
    agent = WebSearchAgent()
    questions = [
        "How many studio albums were published by Taylor Swift between 2010 and 2015?",
        "When was Albert Einstein born?",
        "What is the capital of Australia?",
        "List the members of The Beatles"
    ]
    for question in questions:
        print("\n" + "="*50)
        print(f"Question: {question}")
        result = agent.answer_question(question)
        # answer_question returns only {"status": ...} when nothing was
        # found; indexing 'best_answer' in that case would raise KeyError.
        if "best_answer" not in result:
            print(f"\n{result.get('status', 'No answers found')}")
            continue
        print("\nBest Answer:")
        if isinstance(result['best_answer'], list):
            for item in result['best_answer']:
                print(f"- {item}")
        else:
            print(result['best_answer'])
        print(f"\nSource: {result['source']}")
        print(f"Confidence: {result['confidence']:.0%}")
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """