Toumaima committed on
Commit
704ac65
·
verified ·
1 Parent(s): 0897129

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -42
app.py CHANGED
@@ -9,7 +9,7 @@ from duckduckgo_search import DDGS
9
  from transformers import pipeline
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  import numpy as np
12
- import wikipedia
13
 
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -18,82 +18,86 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
  class BasicAgent:
19
  def __init__(self):
20
  print("BasicAgent initialized.")
21
- # Initialize the Whisper model for video transcription
22
  self.whisper_model = whisper.load_model("base") # You can change the model to `large`, `medium`, etc.
23
  self.search_pipeline = pipeline("question-answering")
24
  self.nlp_model = pipeline("feature-extraction") # For semantic similarity (using transformer model)
 
25
 
26
- def score_search_results(self, question: str, search_results: list) -> str:
27
- # Transform the question and results to embeddings (vector representations)
28
- question_embedding = self.nlp_model(question)
29
- question_embedding = np.mean(question_embedding[0], axis=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
 
 
 
31
  best_score = -1
32
  best_answer = None
33
-
34
- # Loop through search results and calculate similarity
35
  for result in search_results:
36
- result_embedding = self.nlp_model(result['body'])
37
- result_embedding = np.mean(result_embedding[0], axis=0)
38
-
39
- # Calculate cosine similarity
40
- similarity = cosine_similarity([question_embedding], [result_embedding])
41
-
42
- # Check if this result is better
43
  if similarity > best_score:
44
  best_score = similarity
45
  best_answer = result['body']
46
-
47
- return best_answer
48
 
49
  def search(self, question: str) -> str:
50
- # Try Wikipedia first for reliable context
51
- try:
52
- wiki_titles = wikipedia.search(question)
53
- if wiki_titles:
54
- page = wikipedia.page(wiki_titles[0])
55
- wiki_content = page.content[:4000] # Truncate to 4000 chars for the QA model
56
- result = self.search_pipeline(question=question, context=wiki_content)
57
- return result["answer"]
58
- except Exception as e:
59
- print(f"Wikipedia lookup failed: {e}")
60
  try:
61
  with DDGS() as ddgs:
62
- results = list(ddgs.text(question, max_results=3)) # Fetch top 3 results
63
- if results:
64
- # Score all the results and return the best one
65
- return self.score_search_results(question, results)
66
- else:
67
  return "No relevant search results found."
 
 
 
 
 
 
 
 
 
68
  except Exception as e:
69
  return f"Search error: {e}"
70
 
71
  def call_whisper(self, video_path: str) -> str:
72
- # Transcribe the video to text using Whisper model
73
  video = moviepy.editor.VideoFileClip(video_path)
74
  audio_path = "temp_audio.wav"
75
  video.audio.write_audiofile(audio_path)
76
-
77
- # Transcribe audio to text
78
  result = self.whisper_model.transcribe(audio_path)
79
  return result["text"]
80
 
81
  def __call__(self, question: str, video_path: str = None) -> str:
82
  print(f"Agent received question (first 50 chars): {question[:50]}...")
83
-
84
- # If a video path is provided, use Whisper to transcribe the video
85
  if video_path:
86
  transcription = self.call_whisper(video_path)
87
- print(f"Transcribed video text: {transcription[:100]}...") # Print first 100 characters
88
  return transcription
89
 
90
- # If no video is provided, search the web for an answer
91
- search_answer = self.search(question)
92
- print(f"Agent returning search result: {search_answer[:100]}...")
93
  time.sleep(2)
94
- return search_answer
95
 
96
 
 
97
  def run_and_submit_all(profile: gr.OAuthProfile | None):
98
  """
99
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
9
  from transformers import pipeline
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  import numpy as np
12
+ from bs4 import BeautifulSoup
13
 
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
18
  class BasicAgent:
19
  def __init__(self):
20
  print("BasicAgent initialized.")
21
+ # Initialize Whisper model for video transcription
22
  self.whisper_model = whisper.load_model("base") # You can change the model to `large`, `medium`, etc.
23
  self.search_pipeline = pipeline("question-answering")
24
  self.nlp_model = pipeline("feature-extraction") # For semantic similarity (using transformer model)
25
+ self.ner_pipeline = pipeline("ner", grouped_entities=True)
26
 
27
+ def extract_person_entities(self, text: str) -> list:
28
+ # Extract named entities (persons) from the text
29
+ entities = self.ner_pipeline(text[:1000])
30
+ return [e['word'] for e in entities if e['entity_group'] == 'PER']
31
+
32
+ def extract_wikipedia_nominator(self, search_results: list) -> str:
33
+ # Check if search result contains Wikipedia nomination info
34
+ for result in search_results:
35
+ if "Wikipedia:Featured_article_candidates" in result.get('href', ''):
36
+ try:
37
+ response = requests.get(result['href'], timeout=10)
38
+ soup = BeautifulSoup(response.text, 'html.parser')
39
+ text = soup.get_text()
40
+ for line in text.split("\n"):
41
+ if "nominated by" in line.lower():
42
+ persons = self.extract_person_entities(line)
43
+ return f"Nominated by {persons[0]}" if persons else line.strip()
44
+ except Exception:
45
+ continue
46
+ return None
47
 
48
+ def score_search_results(self, question: str, search_results: list) -> str:
49
+ # Calculate semantic similarity and score the search results
50
+ question_embedding = np.mean(self.nlp_model(question)[0], axis=0)
51
  best_score = -1
52
  best_answer = None
 
 
53
  for result in search_results:
54
+ result_embedding = np.mean(self.nlp_model(result['body'])[0], axis=0)
55
+ similarity = cosine_similarity([question_embedding], [result_embedding])[0][0]
 
 
 
 
 
56
  if similarity > best_score:
57
  best_score = similarity
58
  best_answer = result['body']
59
+ return best_answer or "No high-confidence answer found."
 
60
 
61
  def search(self, question: str) -> str:
 
 
 
 
 
 
 
 
 
 
62
  try:
63
  with DDGS() as ddgs:
64
+ results = list(ddgs.text(question, max_results=5)) # Fetch top 5 results
65
+ if not results:
 
 
 
66
  return "No relevant search results found."
67
+
68
+ # If the question relates to Wikipedia Featured Article nomination, check for nomination
69
+ if "featured article" in question.lower() and "wikipedia" in question.lower():
70
+ nomination_info = self.extract_wikipedia_nominator(results)
71
+ if nomination_info:
72
+ return nomination_info
73
+
74
+ # Otherwise, return the best search result based on semantic similarity
75
+ return self.score_search_results(question, results)
76
  except Exception as e:
77
  return f"Search error: {e}"
78
 
79
  def call_whisper(self, video_path: str) -> str:
80
+ # Transcribe video using Whisper
81
  video = moviepy.editor.VideoFileClip(video_path)
82
  audio_path = "temp_audio.wav"
83
  video.audio.write_audiofile(audio_path)
 
 
84
  result = self.whisper_model.transcribe(audio_path)
85
  return result["text"]
86
 
87
  def __call__(self, question: str, video_path: str = None) -> str:
88
  print(f"Agent received question (first 50 chars): {question[:50]}...")
 
 
89
  if video_path:
90
  transcription = self.call_whisper(video_path)
91
+ print(f"Transcribed video text: {transcription[:100]}...")
92
  return transcription
93
 
94
+ answer = self.search(question)
95
+ print(f"Agent returning search result: {answer[:100]}...")
 
96
  time.sleep(2)
97
+ return answer
98
 
99
 
100
+ # --- Run and Submit All ---
101
  def run_and_submit_all(profile: gr.OAuthProfile | None):
102
  """
103
  Fetches all questions, runs the BasicAgent on them, submits all answers,