Update sherlock2.py

sherlock2.py  +101 -64  CHANGED
@@ -3,6 +3,7 @@ import google.ai.generativelanguage as glm
 import streamlit as st
 from bs4 import BeautifulSoup
 import wikipedia
+from wikipedia.exceptions import DisambiguationError, PageError
 import os
 import re
 import requests
@@ -83,74 +84,110 @@ def search_and_scrape_wikipedia(keywords, max_topics_per_query=3, react_model='g
         - "additional_sources": (Optional) A list of additional source URLs extracted from citations.
     """

-
-
-
-
-    **Thought 1:** I need to search Wikipedia for information related to "{question}".
-
-    **Action 1:**
-
-    **Observation 1:** {observation} # This will be filled in during the process
-
-    # ... (Further Thought-Action-Observation steps as needed)
-
-    **Action N:** # The final answer will be extracted from here
+    model_instructions = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, Observation is understanding relevant information from an Action's output and Action can be of three types:
+    (1) <search>entity</search>, which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search and you can try to search the information from those topics.
+    (2) <lookup>keyword</lookup>, which returns the next sentence containing keyword in the current context. This only does exact matches, so keep your searches short.
+    (3) <finish>answer</finish>, which returns the answer and finishes the task.
     """

-
+    # Define tools for ReAct (search, lookup, finish)
+    class ReAct:
+        def __init__(self, model: str, react_prompt: str):
+            self.model = genai.GenerativeModel(model)
+            self.chat = self.model.start_chat(history=[])
+            self.should_continue_prompting = True
+            self._search_history: list[str] = []
+            self._search_urls: list[str] = []
+            self._prompt = react_prompt
+
+        @classmethod
+        def add_method(cls, func):
+            setattr(cls, func.__name__, func)
+
+        @staticmethod
+        def clean(text: str):
+            text = text.replace("\n", " ")
+            return text
+
+        def search(self, query: str):
+            observation = None
+            query = query.strip()
+            try:
+                observation = wikipedia.summary(query, sentences=4, auto_suggest=False)
+                wiki_url = wikipedia.page(query, auto_suggest=False).url
+                observation = self.clean(observation)
+                self._search_history.append(query)
+                self._search_urls.append(wiki_url)
+            except (DisambiguationError, PageError) as e:
+                observation = f'Could not find ["{query}"]. Similar: {wikipedia.search(query)}.'
+            return observation
+
+        def lookup(self, phrase: str, context_length=200):
+            page = wikipedia.page(self._search_history[-1], auto_suggest=False).content
+            page = self.clean(page)
+            start_index = page.find(phrase)
+            result = page[max(0, start_index - context_length):start_index + len(phrase) + context_length]
+            return result
+
+        def finish(self, _):
+            self.should_continue_prompting = False
+
+        def __call__(self, user_question, max_calls: int = 8, **generation_kwargs):
+            if len(self.chat.history) == 0:
+                model_prompt = self._prompt.format(question=user_question)
+            else:
+                model_prompt = user_question
+
+            callable_entities = ['</search>', '</lookup>', '</finish>']
+            generation_kwargs.update({'stop_sequences': callable_entities})
+
+            self.should_continue_prompting = True
+            for idx in range(max_calls):
+                self.response = self.chat.send_message(content=[model_prompt],
+                                                       generation_config=generation_kwargs, stream=False)
+                response_cmd = self.chat.history[-1].parts[-1].text
+                try:
+                    cmd = re.findall(r'<(.*)>', response_cmd)[-1]
+                    query = response_cmd.split(f'<{cmd}>')[-1].strip()
+                    observation = self.__getattribute__(cmd)(query)
+
+                    if not self.should_continue_prompting:
+                        break
+
+                    model_prompt = f"\nObservation {idx + 1}\n{observation}"
+
+                except (IndexError, AttributeError) as e:
+                    model_prompt = "Please try to generate thought-action-observation traces."
+
+    # Initialize ReAct with model and instructions
+    react_agent = ReAct(model=react_model, react_prompt=model_instructions)
+
+    search_history = set()
     wikipedia_info = []
-    react_model = genai.GenerativeModel(react_model)  # Initialize the generative model
-
     for query in keywords:
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if action == "finish":
-            answer = observation.strip()  # Extract the final answer
-            break  # Exit the loop when <finish> is encountered
-
-        # Get Wikipedia page and URL (modified line)
-        page = wikipedia.page(search_term, auto_suggest=False)
-        url = page.url
-
-        additional_sources = []
-        if response.candidates[0].citation_metadata:
-            additional_sources = [source.url for source in response.candidates[0].citation_metadata.citation_sources]
-
-        wikipedia_info.append({
-            "topic": search_term,
-            "summary": answer,  # Use the extracted answer as the summary
-            "url": url,
-            "additional_sources": additional_sources
-        })
-
-        except wikipedia.exceptions.DisambiguationError:
-            print(f"Ambiguous results for '{search_term}' (originally for '{query}'), skipping.")
-        except wikipedia.exceptions.PageError:
-            print(f"No Wikipedia page found for '{search_term}', skipping.")
-        except Exception as e:
-            st.error(f"Error searching Wikipedia: {e}")

     return wikipedia_info
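The control flow in __call__ turns on the stop_sequences passed to send_message: generation halts as soon as the model emits a closing tag such as </search>, so the last open tag in the reply names the tool to dispatch and everything after it is the argument. A self-contained sketch of that parsing step, where the response_cmd string is an invented stand-in for real model output:

    import re

    # Invented sample: generation stopped at the '</search>' stop sequence,
    # so the closing tag itself never appears in the text.
    response_cmd = "Thought 1: I should look up the article.\nAction 1: <search>Sherlock Holmes"

    cmd = re.findall(r'<(.*)>', response_cmd)[-1]       # -> 'search'
    query = response_cmd.split(f'<{cmd}>')[-1].strip()  # -> 'Sherlock Holmes'
    print(cmd, '|', query)                              # dispatched as react_agent.search(query)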
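For a sense of how the new path is driven end to end, a minimal usage sketch: it assumes genai.configure(api_key=...) has already run elsewhere in sherlock2.py, the keywords are invented examples, and the react_model value is likewise an assumption, since the default is truncated in the hunk header above.

    # Hypothetical driver; not part of the commit.
    results = search_and_scrape_wikipedia(
        ["Arthur Conan Doyle", "Reichenbach Falls"],  # invented queries
        react_model="gemini-pro",  # assumed model name, not confirmed by the diff
    )
    for info in results:
        print(f"{info['topic']}: {info['url']}")
        print(info["summary"])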